1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package runes provides transforms for UTF-8 encoded text.
6 package runes // import "golang.org/x/text/runes"
12 "golang.org/x/text/transform"
15 // A Set is a collection of runes.
17 // Contains returns true if r is contained in the set.
// setFunc is a function type over runes; the Contains method declared on it
// lets a plain func(rune) bool be returned as a Set (see In and NotIn).
21 type setFunc func(rune) bool
// Contains is the Set membership method for a setFunc.
// NOTE(review): the method body is not visible in this excerpt; presumably it
// returns s(r) — confirm against the full file.
23 func (s setFunc) Contains(r rune) bool {
27 // Note: using funcs here instead of wrapping types results in cleaner
28 // documentation and a smaller API.
30 // In creates a Set whose Contains method reports whether a rune is in the
31 // given RangeTable, as decided by unicode.Is(rt, r).
32 func In(rt *unicode.RangeTable) Set {
33 return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
36 // NotIn creates a Set whose Contains method reports whether a rune is not
37 // in the given RangeTable, i.e. it returns !unicode.Is(rt, r).
38 func NotIn(rt *unicode.RangeTable) Set {
39 return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
42 // Predicate creates a Set with a Contains method that returns f(r).
// According to the comment inside Remove, sets created by Predicate take a
// faster path there.
// NOTE(review): the function body is not visible in this excerpt.
43 func Predicate(f func(rune) bool) Set {
47 // Transformer implements the transform.Transformer interface. It wraps a
// transform.SpanningTransformer, to which its Transform, Span and Reset
// methods delegate.
48 type Transformer struct {
49 t transform.SpanningTransformer // wrapped implementation that does the actual work
// Transform forwards dst, src and atEOF unchanged to the Transform method of
// the wrapped transformer and returns its results.
52 func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
53 return t.t.Transform(dst, src, atEOF)
// Span forwards b and atEOF unchanged to the Span method of the wrapped
// transformer and returns its results.
56 func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
57 return t.t.Span(b, atEOF)
// Reset calls Reset on the wrapped transformer.
60 func (t Transformer) Reset() { t.t.Reset() }
62 // Bytes returns a new byte slice with the result of converting b using t. It
63 // calls Reset on t. It returns nil if any error was found. This can only happen
64 // if an error-producing Transformer is passed to If.
// NOTE(review): the error-handling lines of the body are not visible in this
// excerpt.
65 func (t Transformer) Bytes(b []byte) []byte {
66 b, _, err := transform.Bytes(t, b) // the second result of transform.Bytes is discarded
73 // String returns a string with the result of converting s using t. It calls
74 // Reset on t. It returns the empty string if any error was found. This can only
75 // happen if an error-producing Transformer is passed to If.
// NOTE(review): the error-handling lines of the body are not visible in this
// excerpt.
76 func (t Transformer) String(s string) string {
77 s, _, err := transform.String(t, s) // the second result of transform.String is discarded
85 // - Copy: copying strings and bytes in whole-rune units.
86 // - Validation (maybe)
87 // - Well-formed-ness (maybe)
// runeErrorString is utf8.RuneError converted to a string. The transformers
// below write its bytes at indices 0, 1 and 2 to the output wherever an
// ill-formed input byte is replaced.
89 const runeErrorString = string(utf8.RuneError)
91 // Remove returns a Transformer that removes runes r for which s.Contains(r).
92 // Illegal input bytes are replaced by RuneError before being passed to
// s.Contains.
93 func Remove(s Set) Transformer {
94 if f, ok := s.(setFunc); ok {
95 // This little trick cuts the running time of BenchmarkRemove for sets
96 // created by Predicate roughly in half.
// A setFunc is converted to remove directly, rather than going through
// the s.Contains method value used in the general case.
97 // TODO: special-case RangeTables as well.
98 return Transformer{remove(f)}
100 return Transformer{remove(s.Contains)}
103 // TODO: remove transform.RemoveFunc.
// remove is the predicate-backed transformer type that Remove wraps in a
// Transformer; the function value is the rune predicate.
105 type remove func(r rune) bool
// Reset does nothing; the method body is empty.
107 func (remove) Reset() {}
109 // Span implements transform.Spanner.
// NOTE(review): several lines of this body are missing from the excerpt; the
// comments below describe only the statements that are visible.
110 func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
111 for r, size := rune(0), 0; n < len(src); {
112 if r = rune(src[n]); r < utf8.RuneSelf { // single-byte (ASCII) case, no decoding needed
114 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { // non-ASCII byte that did not decode as a multi-byte rune
116 if !atEOF && !utf8.FullRune(src[n:]) { // more input may still arrive and the bytes here do not yet form a full rune
117 err = transform.ErrShortSrc
119 err = transform.ErrEndOfSpan
124 err = transform.ErrEndOfSpan
132 // Transform implements transform.Transformer.
// NOTE(review): several lines of this body are missing from the excerpt; the
// comments below describe only the statements that are visible.
133 func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
134 for r, size := rune(0), 0; nSrc < len(src); {
135 if r = rune(src[nSrc]); r < utf8.RuneSelf { // single-byte (ASCII) case, no decoding needed
137 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { // non-ASCII byte that did not decode as a multi-byte rune
139 if !atEOF && !utf8.FullRune(src[nSrc:]) { // more input may still arrive and the bytes here do not yet form a full rune
140 err = transform.ErrShortSrc
143 // We replace illegal bytes with RuneError. Not doing so might
144 // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
145 // The resulting byte sequence may subsequently contain runes
146 // for which t(r) is true that were passed unnoticed.
147 if !t(utf8.RuneError) { // RuneError is written only when the predicate does not select it
148 if nDst+3 > len(dst) { // the replacement occupies three bytes
149 err = transform.ErrShortDst
152 dst[nDst+0] = runeErrorString[0]
153 dst[nDst+1] = runeErrorString[1]
154 dst[nDst+2] = runeErrorString[2]
164 if nDst+size > len(dst) { // not enough room for the size source bytes
165 err = transform.ErrShortDst
168 for i := 0; i < size; i++ {
169 dst[nDst] = src[nSrc]
177 // Map returns a Transformer that maps the runes in the input using the given
178 // mapping. Illegal bytes in the input are converted to utf8.RuneError before
179 // being passed to the mapping func.
180 func Map(mapping func(rune) rune) Transformer {
181 return Transformer{mapper(mapping)} // mapper(mapping) is a type conversion, not a call
// mapper is the mapping-backed transformer type that Map wraps in a
// Transformer; the function value is the rune mapping.
184 type mapper func(rune) rune
// Reset does nothing; the method body is empty.
186 func (mapper) Reset() {}
188 // Span implements transform.Spanner.
// NOTE(review): several lines of this body are missing from the excerpt; the
// comments below describe only the statements that are visible.
189 func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
190 for r, size := rune(0), 0; n < len(src); n += size { // advance by the width of the rune just examined
191 if r = rune(src[n]); r < utf8.RuneSelf { // single-byte (ASCII) case, no decoding needed
193 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { // non-ASCII byte that did not decode as a multi-byte rune
195 if !atEOF && !utf8.FullRune(src[n:]) { // more input may still arrive and the bytes here do not yet form a full rune
196 err = transform.ErrShortSrc
198 err = transform.ErrEndOfSpan
203 err = transform.ErrEndOfSpan
210 // Transform implements transform.Transformer.
// NOTE(review): this excerpt omits several lines of the body (including the
// declaration of replacement); the comments below cover only the visible
// statements.
211 func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
213 var b [utf8.UTFMax]byte // scratch buffer that utf8.EncodeRune fills with the replacement rune
215 for r, size := rune(0), 0; nSrc < len(src); {
216 if r = rune(src[nSrc]); r < utf8.RuneSelf {
217 if replacement = t(r); replacement < utf8.RuneSelf { // ASCII mapped to ASCII: a single output byte
218 if nDst == len(dst) {
219 err = transform.ErrShortDst
222 dst[nDst] = byte(replacement)
228 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { // non-ASCII byte that did not decode as a multi-byte rune
230 if !atEOF && !utf8.FullRune(src[nSrc:]) { // more input may still arrive and the bytes here do not yet form a full rune
231 err = transform.ErrShortSrc
235 if replacement = t(utf8.RuneError); replacement == utf8.RuneError { // the mapping leaves RuneError unchanged
236 if nDst+3 > len(dst) { // the replacement occupies three bytes
237 err = transform.ErrShortDst
240 dst[nDst+0] = runeErrorString[0]
241 dst[nDst+1] = runeErrorString[1]
242 dst[nDst+2] = runeErrorString[2]
247 } else if replacement = t(r); replacement == r { // rune unchanged by the mapping: the source bytes are copied as they are
248 if nDst+size > len(dst) {
249 err = transform.ErrShortDst
252 for i := 0; i < size; i++ {
253 dst[nDst] = src[nSrc]
260 n := utf8.EncodeRune(b[:], replacement) // n is the encoded length of the replacement
262 if nDst+n > len(dst) {
263 err = transform.ErrShortDst
266 for i := 0; i < n; i++ {
275 // ReplaceIllFormed returns a transformer that replaces all input bytes that are
276 // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
277 func ReplaceIllFormed() Transformer {
278 return Transformer{&replaceIllFormed{}} // wraps a pointer to a fresh zero-value replaceIllFormed
// replaceIllFormed is the transformer type wrapped by ReplaceIllFormed. It has
// no fields of its own and embeds transform.NopResetter.
// NOTE(review): presumably the embedded type supplies the Reset method —
// confirm against the transform package.
281 type replaceIllFormed struct{ transform.NopResetter }
// Span scans src for ill-formed UTF-8. In the visible code it sets
// transform.ErrShortSrc when atEOF is false and the remaining bytes are not a
// full rune, and transform.ErrEndOfSpan when an invalid rune is found.
// NOTE(review): the loop header and several other lines of this body are
// missing from the excerpt.
283 func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
286 if src[n] < utf8.RuneSelf { // single-byte (ASCII) case
291 r, size := utf8.DecodeRune(src[n:])
293 // Look for a valid non-ASCII rune.
294 if r != utf8.RuneError || size != 1 { // anything other than (RuneError, 1) is treated as valid here
299 // Look for short source data.
300 if !atEOF && !utf8.FullRune(src[n:]) {
301 err = transform.ErrShortSrc
305 // We have an invalid rune.
306 err = transform.ErrEndOfSpan
// Transform copies well-formed input from src to dst and writes the three
// bytes of runeErrorString for an invalid byte. It sets transform.ErrShortDst
// when dst has no room and transform.ErrShortSrc when atEOF is false and the
// remaining bytes are not a full rune.
// NOTE(review): several lines of this body are missing from the excerpt; the
// comments describe only the statements that are visible.
312 func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
313 for nSrc < len(src) {
315 if r := src[nSrc]; r < utf8.RuneSelf { // single-byte (ASCII) case
316 if nDst == len(dst) {
317 err = transform.ErrShortDst
326 // Look for a valid non-ASCII rune.
327 if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
328 if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { // copy returned fewer than size bytes, so dst is too small
329 err = transform.ErrShortDst
337 // Look for short source data.
338 if !atEOF && !utf8.FullRune(src[nSrc:]) {
339 err = transform.ErrShortSrc
343 // We have an invalid rune.
344 if nDst+3 > len(dst) { // the replacement occupies three bytes
345 err = transform.ErrShortDst
348 dst[nDst+0] = runeErrorString[0]
349 dst[nDst+1] = runeErrorString[1]
350 dst[nDst+2] = runeErrorString[2]
354 return nDst, nSrc, err