vendor/golang.org/x/text/search/search.go

   1 // Copyright 2015 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search
   6
   7 // Package search provides language-specific search and string matching.
   8 //
   9 // Natural language matching can be intricate. For example, Danish will insist
  10 // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
  11 // the lack of a dot) in a case-insensitive match. This package handles such
  12 // language-specific details.
  13 //
  14 // Text passed to any of the calls in this message does not need to be
  15 // normalized.
  16 package search // import "golang.org/x/text/search"
  17
  18 import (
  19         "strings"
  20
  21         "golang.org/x/text/internal/colltab"
  22         "golang.org/x/text/language"
  23 )
  24
  25 // An Option configures a Matcher.
  26 type Option func(*Matcher)
  27
  28 var (
  29         // WholeWord restricts matches to complete words. The default is to match at
  30         // the character level.
  31         WholeWord Option = nil
  32
  33         // Exact requires that two strings are their exact equivalent. For example
  34         // å would not match aa in Danish. It overrides any of the ignore options.
  35         Exact Option = nil
  36
  37         // Loose causes case, diacritics and width to be ignored.
  38         Loose Option = loose
  39
  40         // IgnoreCase enables case-insensitive search.
  41         IgnoreCase Option = ignoreCase
  42
  43         // IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
  44         IgnoreDiacritics Option = ignoreDiacritics
  45
  46         // IgnoreWidth equates narrow with wide variants.
  47         IgnoreWidth Option = ignoreWidth
  48 )
  49
  50 func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
  51 func ignoreCase(m *Matcher)       { m.ignoreCase = true }
  52 func ignoreWidth(m *Matcher)      { m.ignoreWidth = true }
  53 func loose(m *Matcher) {
  54         ignoreDiacritics(m)
  55         ignoreCase(m)
  56         ignoreWidth(m)
  57 }
  58
  59 var (
  60         // Supported lists the languages for which search differs from its parent.
  61         Supported language.Coverage
  62
  63         tags []language.Tag
  64 )
  65
  66 func init() {
  67         ids := strings.Split(availableLocales, ",")
  68         tags = make([]language.Tag, len(ids))
  69         for i, s := range ids {
  70                 tags[i] = language.Raw.MustParse(s)
  71         }
  72         Supported = language.NewCoverage(tags)
  73 }
  74
  75 // New returns a new Matcher for the given language and options.
  76 func New(t language.Tag, opts ...Option) *Matcher {
  77         m := &Matcher{
  78                 w: getTable(locales[colltab.MatchLang(t, tags)]),
  79         }
  80         for _, f := range opts {
  81                 f(m)
  82         }
  83         return m
  84 }
  85
  86 // A Matcher implements language-specific string matching.
  87 type Matcher struct {
  88         w                colltab.Weighter
  89         ignoreCase       bool
  90         ignoreWidth      bool
  91         ignoreDiacritics bool
  92 }
  93
  94 // An IndexOption specifies how the Index methods of Pattern or Matcher should
  95 // match the input.
  96 type IndexOption byte
  97
  98 const (
  99         // Anchor restricts the search to the start (or end for Backwards) of the
 100         // text.
 101         Anchor IndexOption = 1 << iota
 102
 103         // Backwards starts the search from the end of the text.
 104         Backwards
 105
 106         anchorBackwards = Anchor | Backwards
 107 )
 108
 109 // Index reports the start and end position of the first occurrence of pat in b
 110 // or -1, -1 if pat is not present.
 111 func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
 112         // TODO: implement optimized version that does not use a pattern.
 113         return m.Compile(pat).Index(b, opts...)
 114 }
 115
 116 // IndexString reports the start and end position of the first occurrence of pat
 117 // in s or -1, -1 if pat is not present.
 118 func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
 119         // TODO: implement optimized version that does not use a pattern.
 120         return m.CompileString(pat).IndexString(s, opts...)
 121 }
 122
 123 // Equal reports whether a and b are equivalent.
 124 func (m *Matcher) Equal(a, b []byte) bool {
 125         _, end := m.Index(a, b, Anchor)
 126         return end == len(a)
 127 }
 128
 129 // EqualString reports whether a and b are equivalent.
 130 func (m *Matcher) EqualString(a, b string) bool {
 131         _, end := m.IndexString(a, b, Anchor)
 132         return end == len(a)
 133 }
 134
 135 // Compile compiles and returns a pattern that can be used for faster searching.
 136 func (m *Matcher) Compile(b []byte) *Pattern {
 137         p := &Pattern{m: m}
 138         iter := colltab.Iter{Weighter: m.w}
 139         for iter.SetInput(b); iter.Next(); {
 140         }
 141         p.ce = iter.Elems
 142         p.deleteEmptyElements()
 143         return p
 144 }
 145
 146 // CompileString compiles and returns a pattern that can be used for faster
 147 // searching.
 148 func (m *Matcher) CompileString(s string) *Pattern {
 149         p := &Pattern{m: m}
 150         iter := colltab.Iter{Weighter: m.w}
 151         for iter.SetInputString(s); iter.Next(); {
 152         }
 153         p.ce = iter.Elems
 154         p.deleteEmptyElements()
 155         return p
 156 }
 157
 158 // A Pattern is a compiled search string. It is safe for concurrent use.
 159 type Pattern struct {
 160         m  *Matcher
 161         ce []colltab.Elem
 162 }
 163
 164 // Design note (TODO remove):
 165 // The cost of retrieving collation elements for each rune, which is used for
 166 // search as well, is not trivial. Also, algorithms like Boyer-Moore and
 167 // Sunday require some additional precomputing.
 168
 169 // Index reports the start and end position of the first occurrence of p in b
 170 // or -1, -1 if p is not present.
 171 func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
 172         // Pick a large enough buffer such that we likely do not need to allocate
 173         // and small enough to not cause too much overhead initializing.
 174         var buf [8]colltab.Elem
 175
 176         it := &colltab.Iter{
 177                 Weighter: p.m.w,
 178                 Elems:    buf[:0],
 179         }
 180         it.SetInput(b)
 181
 182         var optMask IndexOption
 183         for _, o := range opts {
 184                 optMask |= o
 185         }
 186
 187         switch optMask {
 188         case 0:
 189                 return p.forwardSearch(it)
 190         case Anchor:
 191                 return p.anchoredForwardSearch(it)
 192         case Backwards, anchorBackwards:
 193                 panic("TODO: implement")
 194         default:
 195                 panic("unrecognized option")
 196         }
 197 }
 198
 199 // IndexString reports the start and end position of the first occurrence of p
 200 // in s or -1, -1 if p is not present.
 201 func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
 202         // Pick a large enough buffer such that we likely do not need to allocate
 203         // and small enough to not cause too much overhead initializing.
 204         var buf [8]colltab.Elem
 205
 206         it := &colltab.Iter{
 207                 Weighter: p.m.w,
 208                 Elems:    buf[:0],
 209         }
 210         it.SetInputString(s)
 211
 212         var optMask IndexOption
 213         for _, o := range opts {
 214                 optMask |= o
 215         }
 216
 217         switch optMask {
 218         case 0:
 219                 return p.forwardSearch(it)
 220         case Anchor:
 221                 return p.anchoredForwardSearch(it)
 222         case Backwards, anchorBackwards:
 223                 panic("TODO: implement")
 224         default:
 225                 panic("unrecognized option")
 226         }
 227 }
 228
 229 // TODO:
 230 // - Maybe IndexAll methods (probably not necessary).
 231 // - Some way to match patterns in a Reader (a bit tricky).
 232 // - Some fold transformer that folds text to comparable text, based on the
 233 //   search options. This is a common technique, though very different from the
 234 //   collation-based design of this package. It has a somewhat different use
 235 //   case, so probably makes sense to support both. Should probably be in a
 236 //   different package, though, as it uses completely different kind of tables
 237 //   (based on norm, cases, width and range tables.)