1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
15 "golang.org/x/text/internal/tag"
18 // isAlpha returns true if the byte is not a digit.
19 // b must be an ASCII letter or digit.
20 func isAlpha(b byte) bool {
24 // isAlphaNum returns true if the string contains only ASCII letters or digits.
25 func isAlphaNum(s []byte) bool {
27 if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
34 // errSyntax is returned by any of the parsing functions when the
35 // input is not well-formed, according to BCP 47.
36 // TODO: return the position at which the syntax error occurred?
37 var errSyntax = errors.New("language: tag is not well-formed")
39 // ValueError is returned by any of the parsing functions when the
40 // input is well-formed but the respective subtag is not recognized
42 type ValueError struct {
46 func mkErrInvalid(s []byte) error {
52 func (e ValueError) tag() []byte {
53 n := bytes.IndexByte(e.v[:], 0)
60 // Error implements the error interface.
61 func (e ValueError) Error() string {
62 return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
65 // Subtag returns the subtag for which the error occurred.
66 func (e ValueError) Subtag() string {
67 return string(e.tag())
70 // scanner is used to scan BCP 47 tokens, which are separated by _ or -.
73 bytes [max99thPercentileSize]byte
75 start int // start position of the current token
76 end int // end position of the current token
77 next int // next point for scan
82 func makeScannerString(s string) scanner {
84 if len(s) <= len(scan.bytes) {
85 scan.b = scan.bytes[:copy(scan.bytes[:], s)]
93 // makeScanner returns a scanner using b as the input buffer.
94 // b is not copied and may be modified by the scanner routines.
95 func makeScanner(b []byte) scanner {
101 func (s *scanner) init() {
102 for i, c := range s.b {
110 // toLower converts the string between start and end to lower case.
111 func (s *scanner) toLower(start, end int) {
112 for i := start; i < end; i++ {
114 if 'A' <= c && c <= 'Z' {
120 func (s *scanner) setError(e error) {
121 if s.err == nil || (e == errSyntax && s.err != errSyntax) {
126 // resizeRange shrinks or grows the array at position oldStart such that
127 // a new string of size newSize can fit between oldStart and oldEnd.
128 // Sets the scan point to after the resized range.
129 func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
131 if end := oldStart + newSize; end != oldEnd {
134 b := make([]byte, len(s.b)+diff)
135 copy(b, s.b[:oldStart])
136 copy(b[end:], s.b[oldEnd:])
139 s.b = append(s.b[end:], s.b[oldEnd:]...)
141 s.next = end + (s.next - s.end)
146 // replace replaces the current token with repl.
147 func (s *scanner) replace(repl string) {
148 s.resizeRange(s.start, s.end, len(repl))
149 copy(s.b[s.start:], repl)
152 // gobble removes the current token from the input.
153 // Caller must call scan after calling gobble.
154 func (s *scanner) gobble(e error) {
157 s.b = s.b[:+copy(s.b, s.b[s.next:])]
160 s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
166 // deleteRange removes the given range from s.b before the current token.
167 func (s *scanner) deleteRange(start, end int) {
168 s.setError(errSyntax)
169 s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
176 // scan parses the next token of a BCP 47 string. Tokens that are larger
177 // than 8 characters or include non-alphanumeric characters result in an error
178 // and are gobbled and removed from the output.
179 // It returns the end position of the last token consumed.
180 func (s *scanner) scan() (end int) {
183 for s.start = s.next; s.next < len(s.b); {
184 i := bytes.IndexByte(s.b[s.next:], '-')
193 token := s.b[s.start:s.end]
194 if i < 1 || i > 8 || !isAlphaNum(token) {
201 if n := len(s.b); n > 0 && s.b[n-1] == '-' {
202 s.setError(errSyntax)
203 s.b = s.b[:len(s.b)-1]
209 // acceptMinSize parses multiple tokens of the given size or greater.
210 // It returns the end position of the last token consumed.
211 func (s *scanner) acceptMinSize(min int) (end int) {
214 for ; len(s.token) >= min; s.scan() {
220 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
221 // failed it returns an error and any part of the tag that could be parsed.
222 // If parsing succeeded but an unknown value was found, it returns
223 // ValueError. The Tag returned in this case is just stripped of the unknown
224 // value. All other values are preserved. It accepts tags in the BCP 47 format
225 // and extensions to this standard defined in
226 // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
227 // The resulting tag is canonicalized using the default canonicalization type.
228 func Parse(s string) (t Tag, err error) {
229 return Default.Parse(s)
232 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
233 // failed it returns an error and any part of the tag that could be parsed.
234 // If parsing succeeded but an unknown value was found, it returns
235 // ValueError. The Tag returned in this case is just stripped of the unknown
236 // value. All other values are preserved. It accepts tags in the BCP 47 format
237 // and extensions to this standard defined in
238 // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
239 // The resulting tag is canonicalized using the canonicalization type c.
240 func (c CanonType) Parse(s string) (t Tag, err error) {
241 // TODO: consider supporting old-style locale key-value pairs.
243 return und, errSyntax
245 if len(s) <= maxAltTaglen {
246 b := [maxAltTaglen]byte{}
247 for i, c := range s {
248 // Generating invalid UTF-8 is okay as it won't match.
249 if 'A' <= c && c <= 'Z' {
256 if t, ok := grandfathered(b); ok {
260 scan := makeScannerString(s)
261 t, err = parse(&scan, s)
262 t, changed := t.canonicalize(c)
269 func parse(scan *scanner, s string) (t Tag, err error) {
272 if n := len(scan.token); n <= 1 {
273 scan.toLower(0, len(scan.b))
274 if n == 0 || scan.token[0] != 'x' {
277 end = parseExtensions(scan)
279 return und, errSyntax
280 } else { // the usual case
281 t, end = parseTag(scan)
282 if n := len(scan.token); n == 1 {
284 end = parseExtensions(scan)
285 } else if end < len(scan.b) {
286 scan.setError(errSyntax)
287 scan.b = scan.b[:end]
290 if int(t.pVariant) < len(scan.b) {
294 if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
297 t.str = string(scan.b)
300 t.pVariant, t.pExt = 0, 0
305 // parseTag parses language, script, region and variants.
306 // It returns a Tag and the end position in the input that was parsed.
307 func parseTag(scan *scanner) (t Tag, end int) {
309 // TODO: set an error if an unknown lang, script or region is encountered.
310 t.lang, e = getLangID(scan.token)
312 scan.replace(t.lang.String())
313 langStart := scan.start
315 for len(scan.token) == 3 && isAlpha(scan.token[0]) {
316 // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
317 // to a tag of the form <extlang>.
318 lang, e := getLangID(scan.token)
321 copy(scan.b[langStart:], lang.String())
322 scan.b[langStart+3] = '-'
323 scan.start = langStart + 4
328 if len(scan.token) == 4 && isAlpha(scan.token[0]) {
329 t.script, e = getScriptID(script, scan.token)
335 if n := len(scan.token); n >= 2 && n <= 3 {
336 t.region, e = getRegionID(scan.token)
340 scan.replace(t.region.String())
344 scan.toLower(scan.start, len(scan.b))
345 t.pVariant = byte(end)
346 end = parseVariants(scan, end, t)
// separator is the single-byte '-' delimiter passed to bytes.Join when
// variants, extensions, attributes and keys are reassembled into the buffer.
351 var separator = []byte{'-'}
353 // parseVariants scans tokens as long as each token is a valid variant string.
354 // Duplicate variants are removed.
355 func parseVariants(scan *scanner, end int, t Tag) int {
357 varIDBuf := [4]uint8{}
358 variantBuf := [4][]byte{}
359 varID := varIDBuf[:0]
360 variant := variantBuf[:0]
363 for ; len(scan.token) >= 4; scan.scan() {
364 // TODO: measure the impact of needing this conversion and redesign
365 // the data structure if there is an issue.
366 v, ok := variantIndex[string(scan.token)]
369 // TODO: allow user-defined variants?
370 scan.gobble(mkErrInvalid(scan.token))
373 varID = append(varID, v)
374 variant = append(variant, scan.token)
380 // There are no legal combinations of more than 7 variants
381 // (and this is by no means a useful sequence).
382 const maxVariants = 8
383 if len(varID) > maxVariants {
391 sort.Sort(variantsSort{varID, variant})
393 for i, v := range varID {
396 // Remove duplicates.
400 variant[k] = variant[i]
404 if str := bytes.Join(variant[:k], separator); len(str) == 0 {
407 scan.resizeRange(start, end, len(str))
408 copy(scan.b[scan.start:], str)
415 type variantsSort struct {
420 func (s variantsSort) Len() int {
424 func (s variantsSort) Swap(i, j int) {
425 s.i[i], s.i[j] = s.i[j], s.i[i]
426 s.v[i], s.v[j] = s.v[j], s.v[i]
429 func (s variantsSort) Less(i, j int) bool {
430 return s.i[i] < s.i[j]
// bytesSort is a slice of byte slices that is handed to sort.Sort; its Less
// method orders elements in ascending bytes.Compare order. It is used to sort
// extensions, attributes and keys.
433 type bytesSort [][]byte
435 func (b bytesSort) Len() int {
439 func (b bytesSort) Swap(i, j int) {
440 b[i], b[j] = b[j], b[i]
443 func (b bytesSort) Less(i, j int) bool {
444 return bytes.Compare(b[i], b[j]) == -1
447 // parseExtensions parses and normalizes the extensions in the buffer.
448 // It returns the last position of scan.b that is part of any extension.
449 // It also trims scan.b to remove excess parts accordingly.
450 func parseExtensions(scan *scanner) int {
455 for len(scan.token) == 1 {
456 extStart := scan.start
458 end = parseExtension(scan)
459 extension := scan.b[extStart:end]
460 if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
461 scan.setError(errSyntax)
464 } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
465 scan.b = scan.b[:end]
467 } else if ext == 'x' {
471 exts = append(exts, extension)
473 sort.Sort(bytesSort(exts))
474 if len(private) > 0 {
475 exts = append(exts, private)
477 scan.b = scan.b[:start]
479 scan.b = append(scan.b, bytes.Join(exts, separator)...)
480 } else if start > 0 {
481 // Strip trailing '-'.
482 scan.b = scan.b[:start-1]
487 // parseExtension parses a single extension and returns the position of
488 // the extension end.
489 func parseExtension(scan *scanner) int {
490 start, end := scan.start, scan.end
491 switch scan.token[0] {
495 for last := []byte{}; len(scan.token) > 2; scan.scan() {
496 if bytes.Compare(scan.token, last) != -1 {
497 // Attributes are unsorted. Start over from scratch.
501 for scan.scan(); len(scan.token) > 2; scan.scan() {
502 attrs = append(attrs, scan.token)
505 sort.Sort(bytesSort(attrs))
506 copy(scan.b[p:], bytes.Join(attrs, separator))
513 for attrEnd := end; len(scan.token) == 2; last = key {
516 end = scan.acceptMinSize(3)
517 // TODO: check key value validity
518 if keyEnd == end || bytes.Compare(key, last) != 1 {
519 // We have an invalid key or the keys are not sorted.
520 // Start scanning keys from scratch and reorder.
524 for scan.scan(); len(scan.token) == 2; {
525 keyStart, keyEnd := scan.start, scan.end
526 end = scan.acceptMinSize(3)
528 keys = append(keys, scan.b[keyStart:end])
530 scan.setError(errSyntax)
534 sort.Sort(bytesSort(keys))
535 reordered := bytes.Join(keys, separator)
536 if e := p + len(reordered); e < end {
537 scan.deleteRange(e, end)
540 copy(scan.b[p:], bytes.Join(keys, separator))
546 if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
547 _, end = parseTag(scan)
548 scan.toLower(start, end)
550 for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
551 end = scan.acceptMinSize(3)
554 end = scan.acceptMinSize(1)
556 end = scan.acceptMinSize(2)
561 // Compose creates a Tag from individual parts, which may be of type Tag, Base,
562 // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
563 // Base, Script or Region or slice of type Variant or Extension is passed more
564 // than once, the latter will overwrite the former. Variants and Extensions are
565 // accumulated, but if two extensions of the same type are passed, the latter
566 // will replace the former. A Tag overwrites all former values and typically
567 // only makes sense as the first argument. The resulting tag is returned after
568 // canonicalizing using the Default CanonType. If one or more errors are
569 // encountered, one of the errors is returned.
570 func Compose(part ...interface{}) (t Tag, err error) {
571 return Default.Compose(part...)
574 // Compose creates a Tag from individual parts, which may be of type Tag, Base,
575 // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
576 // Base, Script or Region or slice of type Variant or Extension is passed more
577 // than once, the latter will overwrite the former. Variants and Extensions are
578 // accumulated, but if two extensions of the same type are passed, the latter
579 // will replace the former. A Tag overwrites all former values and typically
580 // only makes sense as the first argument. The resulting tag is returned after
581 // canonicalizing using CanonType c. If one or more errors are encountered,
582 // one of the errors is returned.
583 func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
585 if err = b.update(part...); err != nil {
588 t, _ = b.tag.canonicalize(c)
590 if len(b.ext) > 0 || len(b.variant) > 0 {
591 sort.Sort(sortVariant(b.variant))
594 b.ext = append(b.ext, b.private)
596 n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
597 buf := make([]byte, n)
598 p := t.genCoreBytes(buf)
600 p += appendTokens(buf[p:], b.variant...)
602 p += appendTokens(buf[p:], b.ext...)
603 t.str = string(buf[:p])
604 } else if b.private != "" {
611 type builder struct {
614 private string // the x extension
621 func (b *builder) addExt(e string) {
623 } else if e[0] == 'x' {
626 b.ext = append(b.ext, e)
// errInvalidArgument is the error that the replace helper inside
// (*builder).update records in the builder's err field.
630 var errInvalidArgument = errors.New("invalid Extension or Variant")
632 func (b *builder) update(part ...interface{}) (err error) {
633 replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
635 b.err = errInvalidArgument
638 for i, v := range *l {
646 for _, x := range part {
647 switch v := x.(type) {
650 b.tag.region = v.region
651 b.tag.script = v.script
654 for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
656 b.variant = append(b.variant, x)
658 b.ext, b.private = nil, ""
659 for i, e := int(v.pExt), ""; i < len(v.str); {
660 i, e = getExtension(v.str, i)
665 b.tag.lang = v.langID
667 b.tag.script = v.scriptID
669 b.tag.region = v.regionID
671 if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
672 b.variant = append(b.variant, v.variant)
675 if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
680 for _, x := range v {
684 b.ext, b.private = nil, ""
685 for _, e := range v {
688 // TODO: support parsing of raw strings based on morphology or just extensions?
696 func tokenLen(token ...string) (n int) {
697 for _, t := range token {
703 func appendTokens(b []byte, token ...string) int {
705 for _, t := range token {
// sortVariant is a slice of variant strings that is handed to sort.Sort; its
// Less method orders elements by their value in variantIndex. Compose uses it
// to sort the builder's variants.
713 type sortVariant []string
715 func (s sortVariant) Len() int {
719 func (s sortVariant) Swap(i, j int) {
720 s[j], s[i] = s[i], s[j]
723 func (s sortVariant) Less(i, j int) bool {
724 return variantIndex[s[i]] < variantIndex[s[j]]
727 func findExt(list []string, x byte) int {
728 for i, e := range list {
736 // getExtension returns the name, body and end position of the extension.
737 func getExtension(s string, p int) (end int, ext string) {
744 end = nextExtension(s, p)
748 // nextExtension finds the next extension within the string, searching
749 // for the -<char>- pattern from position p.
750 // In the vast majority of cases, language tags will have at most
751 // one extension and extensions tend to be small.
752 func nextExtension(s string, p int) int {
753 for n := len(s) - 3; p < n; {
// errInvalidWeight is returned by ParseAcceptLanguage when the quality weight
// of an entry cannot be parsed by strconv.ParseFloat.
766 var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
768 // ParseAcceptLanguage parses the contents of an Accept-Language header as
769 // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
770 // a list of corresponding quality weights. It is more permissive than RFC 2616
771 // and may return non-nil slices even if the input is not valid.
772 // The Tags will be sorted by highest weight first and then by first occurrence.
773 // Tags with a weight of zero will be dropped. An error will be returned if the
774 // input could not be parsed.
775 func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
778 if entry, s = split(s, ','); entry == "" {
782 entry, weight := split(entry, ';')
784 // Scan the language.
785 t, err := Parse(entry)
787 id, ok := acceptFallback[entry]
794 // Scan the optional weight.
797 weight = consume(weight, 'q')
798 weight = consume(weight, '=')
799 // consume returns the empty string when a token could not be
800 // consumed, resulting in an error for ParseFloat.
801 if w, err = strconv.ParseFloat(weight, 32); err != nil {
802 return nil, nil, errInvalidWeight
804 // Drop tags with a quality weight of 0.
811 q = append(q, float32(w))
813 sortStable(&tagSort{tag, q})
817 // consume removes a leading token c from s and returns the result or the empty
818 // string if there is no such token.
819 func consume(s string, c byte) string {
820 if s == "" || s[0] != c {
823 return strings.TrimSpace(s[1:])
826 func split(s string, c byte) (head, tail string) {
827 if i := strings.IndexByte(s, c); i >= 0 {
828 return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
830 return strings.TrimSpace(s), ""
833 // Add hack mapping to deal with a small number of cases that occur
834 // in Accept-Language (with reasonable frequency).
835 var acceptFallback = map[string]langID{
840 "*": _mul, // defined in the spec to match all languages.
843 type tagSort struct {
848 func (s *tagSort) Len() int {
852 func (s *tagSort) Less(i, j int) bool {
853 return s.q[i] > s.q[j]
856 func (s *tagSort) Swap(i, j int) {
857 s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
858 s.q[i], s.q[j] = s.q[j], s.q[i]