// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( "math" "math/rand" "strings" "unicode" "unicode/utf16" "unicode/utf8" "golang.org/x/text/language" "golang.org/x/text/unicode/norm" ) // TODO: replace with functionality in language package. // parent computes the parent language for the given language. // It returns false if the parent is already root. func parent(locale string) (parent string, ok bool) { if locale == "und" { return "", false } if i := strings.LastIndex(locale, "-"); i != -1 { return locale[:i], true } return "und", true } // rewriter is used to both unique strings and create variants of strings // to add to the test set. type rewriter struct { seen map[string]bool addCases bool } func newRewriter() *rewriter { return &rewriter{ seen: make(map[string]bool), } } func (r *rewriter) insert(a []string, s string) []string { if !r.seen[s] { r.seen[s] = true a = append(a, s) } return a } // rewrite takes a sequence of strings in, adds variants of the these strings // based on options and removes duplicates. func (r *rewriter) rewrite(ss []string) []string { ns := []string{} for _, s := range ss { ns = r.insert(ns, s) if r.addCases { rs := []rune(s) rn := rs[0] for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) { rs[0] = c ns = r.insert(ns, string(rs)) } } } return ns } // exemplarySet holds a parsed set of characters from the exemplarCharacters table. type exemplarySet struct { typ exemplarType set []string charIndex int // cumulative total of phrases, including this set } type phraseGenerator struct { sets [exN]exemplarySet n int } func (g *phraseGenerator) init(id string) { ec := exemplarCharacters loc := language.Make(id).String() // get sets for locale or parent locale if the set is not defined. for i := range g.sets { for p, ok := loc, true; ok; p, ok = parent(p) { if set, ok := ec[p]; ok && set[i] != "" { g.sets[i].set = strings.Split(set[i], " ") break } } } r := newRewriter() r.addCases = *cases for i := range g.sets { g.sets[i].set = r.rewrite(g.sets[i].set) } // compute indexes for i, set := range g.sets { g.n += len(set.set) g.sets[i].charIndex = g.n } } // phrase returns the ith phrase, where i < g.n. func (g *phraseGenerator) phrase(i int) string { for _, set := range g.sets { if i < set.charIndex { return set.set[i-(set.charIndex-len(set.set))] } } panic("index out of range") } // generate generates inputs by combining all pairs of examplar strings. // If doNorm is true, all input strings are normalized to NFC. // TODO: allow other variations, statistical models, and random // trailing sequences. func (g *phraseGenerator) generate(doNorm bool) []Input { const ( M = 1024 * 1024 buf8Size = 30 * M buf16Size = 10 * M ) // TODO: use a better way to limit the input size. if sq := int(math.Sqrt(float64(*limit))); g.n > sq { g.n = sq } size := g.n * g.n a := make([]Input, 0, size) buf8 := make([]byte, 0, buf8Size) buf16 := make([]uint16, 0, buf16Size) addInput := func(str string) { buf8 = buf8[len(buf8):] buf16 = buf16[len(buf16):] if len(str) > cap(buf8) { buf8 = make([]byte, 0, buf8Size) } if len(str) > cap(buf16) { buf16 = make([]uint16, 0, buf16Size) } if doNorm { buf8 = norm.NFD.AppendString(buf8, str) } else { buf8 = append(buf8, str...) } buf16 = appendUTF16(buf16, buf8) a = append(a, makeInput(buf8, buf16)) } for i := 0; i < g.n; i++ { p1 := g.phrase(i) addInput(p1) for j := 0; j < g.n; j++ { p2 := g.phrase(j) addInput(p1 + p2) } } // permutate rnd := rand.New(rand.NewSource(int64(rand.Int()))) for i := range a { j := i + rnd.Intn(len(a)-i) a[i], a[j] = a[j], a[i] a[i].index = i // allow restoring this order if input is used multiple times. } return a } func appendUTF16(buf []uint16, s []byte) []uint16 { for len(s) > 0 { r, sz := utf8.DecodeRune(s) s = s[sz:] r1, r2 := utf16.EncodeRune(r) if r1 != 0xFFFD { buf = append(buf, uint16(r1), uint16(r2)) } else { buf = append(buf, uint16(r)) } } return buf }