include/unicode/ubrk.h

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 * Copyright (C) 1996-2015, International Business Machines Corporation and others.
   6 * All Rights Reserved.
   7 ******************************************************************************
   8 */
   9
  10 #ifndef UBRK_H
  11 #define UBRK_H
  12
  13 #include "unicode/utypes.h"
  14 #include "unicode/uloc.h"
  15 #include "unicode/utext.h"
  16
  17 #if U_SHOW_CPLUSPLUS_API
  18 #include "unicode/localpointer.h"
  19 #endif   // U_SHOW_CPLUSPLUS_API
  20
  21 /**
  22  * A text-break iterator.
  23  *  For usage in C programs.
  24  */
  25 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  26 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
  27     /**
  28      *  Opaque type representing an ICU Break iterator object.
  29      *  @stable ICU 2.0
  30      */
  31     typedef struct UBreakIterator UBreakIterator;
  32 #endif
  33
  34 #if !UCONFIG_NO_BREAK_ITERATION
  35
  36 #include "unicode/parseerr.h"
  37
  38 /**
  39  * \file
  40  * \brief C API: BreakIterator
  41  *
  42  * <h2> BreakIterator C API </h2>
  43  *
  44  * The BreakIterator C API defines methods for finding the location
  45  * of boundaries in text. A UBreakIterator maintains a
  46  * current position and scans over text, returning the index of characters
  47  * where boundaries occur.
  48  * <p>
  49  * Line boundary analysis determines where a text string can be broken
  50  * when line-wrapping. The mechanism correctly handles punctuation and
  51  * hyphenated words.
  52  * <p>
  53  * Note: The locale keyword "lb" can be used to modify line break
  54  * behavior according to the CSS level 3 line-break options, see
  55  * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
  56  * "ja@lb=strict", "zh@lb=loose".
  57  * <p>
  58  * Sentence boundary analysis allows selection with correct
  59  * interpretation of periods within numbers and abbreviations, and
  60  * trailing punctuation marks such as quotation marks and parentheses.
  61  * <p>
  62  * Note: The locale keyword "ss" can be used to enable use of
  63  * segmentation suppression data (preventing breaks in English after
  64  * abbreviations such as "Mr." or "Est.", for example), as follows:
  65  * "en@ss=standard".
  66  * <p>
  67  * Word boundary analysis is used by search and replace functions, as
  68  * well as within text editing applications that allow the user to
  69  * select words with a double click. Word selection provides correct
  70  * interpretation of punctuation marks within and following
  71  * words. Characters that are not part of a word, such as symbols or
  72  * punctuation marks, have word-breaks on both sides.
  73  * <p>
  74  * Character boundary analysis identifies the boundaries of
  75  * "Extended Grapheme Clusters", which are groupings of codepoints
  76  * that should be treated as character-like units for many text operations.
  77  * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
  78  * http://www.unicode.org/reports/tr29/ for additional information
  79  * on grapheme clusters and guidelines on their use.
  80  * <p>
  81  * Title boundary analysis locates all positions,
  82  * typically starts of words, that should be set to Title Case
  83  * when title casing the text.
  84  * <p>
  85  * The text boundary positions are found according to the rules
  86  * described in Unicode Standard Annex #29, Unicode Text Segmentation, and
  87  * Unicode Standard Annex #14, Unicode Line Breaking Algorithm.  These
  88  * are available at http://www.unicode.org/reports/tr29/ and
  89  * http://www.unicode.org/reports/tr14/.
  90  * <p>
  91  * In addition to the plain C API defined in this header file, an
  92  * object oriented C++ API with equivalent functionality is defined in the
  93  * file brkiter.h.
  94  * <p>
  95  * Code snippets illustrating the use of the Break Iterator APIs
  96  * are available in the ICU User Guide,
  97  * https://unicode-org.github.io/icu/userguide/boundaryanalysis/
  98  * and in the sample program icu/source/samples/break/break.cpp
  99  */
 100
 101 /** The possible types of text boundaries.  @stable ICU 2.0 */
 102 typedef enum UBreakIteratorType {
 103   /** Character breaks (extended grapheme cluster boundaries)  @stable ICU 2.0 */
 104   UBRK_CHARACTER = 0,
 105   /** Word breaks @stable ICU 2.0 */
 106   UBRK_WORD = 1,
 107   /** Line breaks (positions where text can be broken when line-wrapping) @stable ICU 2.0 */
 108   UBRK_LINE = 2,
 109   /** Sentence breaks @stable ICU 2.0 */
 110   UBRK_SENTENCE = 3,
 111
 112 #ifndef U_HIDE_DEPRECATED_API
 113   /**
 114    * Title Case breaks.
 115    * The iterator created using this type locates title boundaries as described for
 116    * Unicode 3.2 only. For title boundary iteration with Unicode 4.0 and above,
 117    * please use the word boundary iterator.
 118    *
 119    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
 120    */
 121   UBRK_TITLE = 4,
 122     /**
 123      * One more than the highest normal UBreakIteratorType value.
 124      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
 125      */
 126     UBRK_COUNT = 5
 127 #endif  // U_HIDE_DEPRECATED_API
 128 } UBreakIteratorType;
 129
 130 /** Value indicating all text boundaries have been returned.
 131  *  @stable ICU 2.0
 132  */
 133 #define UBRK_DONE ((int32_t) -1)
 134
 135
 136 /**
 137  *  Enum constants for the word break tags returned by
 138  *  ubrk_getRuleStatus().  A range of values is defined for each category of
 139  *  word, to allow for further subdivisions of a category in future releases.
 140  *  Applications should check for tag values falling within the range, rather
 141  *  than for single individual values.
 142  *
 143  * The numeric values of all of these constants are stable (will not change).
 144  *
 145  * @stable ICU 2.2
 146 */
 147 typedef enum UWordBreak {
 148     /** Tag value for "words" that do not fit into any of the other categories.
 149      *  Includes spaces and most punctuation. */
 150     UBRK_WORD_NONE           = 0,
 151     /** Upper bound (exclusive) for tags for uncategorized words. */
 152     UBRK_WORD_NONE_LIMIT     = 100,
 153     /** Tag value for words that appear to be numbers, lower limit.    */
 154     UBRK_WORD_NUMBER         = 100,
 155     /** Tag value for words that appear to be numbers, upper limit (exclusive). */
 156     UBRK_WORD_NUMBER_LIMIT   = 200,
 157     /** Tag value for words that contain letters, excluding
 158      *  hiragana, katakana or ideographic characters, lower limit.    */
 159     UBRK_WORD_LETTER         = 200,
 160     /** Tag value for words containing letters, upper limit (exclusive). */
 161     UBRK_WORD_LETTER_LIMIT   = 300,
 162     /** Tag value for words containing kana characters, lower limit. */
 163     UBRK_WORD_KANA           = 300,
 164     /** Tag value for words containing kana characters, upper limit (exclusive). */
 165     UBRK_WORD_KANA_LIMIT     = 400,
 166     /** Tag value for words containing ideographic characters, lower limit. */
 167     UBRK_WORD_IDEO           = 400,
 168     /** Tag value for words containing ideographic characters, upper limit (exclusive). */
 169     UBRK_WORD_IDEO_LIMIT     = 500
 170 } UWordBreak;
 171
 172 /**
 173  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
 174  *  A range of values is defined for each category of
 175  *  line break, to allow for further subdivisions of a category in future releases.
 176  *  Applications should check for tag values falling within the range, rather
 177  *  than for single individual values.
 178  *
 179  * The numeric values of all of these constants are stable (will not change).
 180  *
 181  * @stable ICU 2.8
 182 */
 183 typedef enum ULineBreakTag {
 184     /** Tag value for soft line breaks, positions at which a line break
 185       *  is acceptable but not required.               */
 186     UBRK_LINE_SOFT            = 0,
 187     /** Upper bound (exclusive) for soft line breaks.  */
 188     UBRK_LINE_SOFT_LIMIT      = 100,
 189     /** Tag value for a hard, or mandatory, line break. */
 190     UBRK_LINE_HARD            = 100,
 191     /** Upper bound (exclusive) for hard line breaks.  */
 192     UBRK_LINE_HARD_LIMIT      = 200
 193 } ULineBreakTag;
 194
 195
 196
 197 /**
 198  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
 199  *  A range of values is defined for each category of
 200  *  sentence, to allow for further subdivisions of a category in future releases.
 201  *  Applications should check for tag values falling within the range, rather
 202  *  than for single individual values.
 203  *
 204  * The numeric values of all of these constants are stable (will not change).
 205  *
 206  * @stable ICU 2.8
 207 */
 208 typedef enum USentenceBreakTag {
 209     /** Tag value for sentences ending with a sentence terminator
 210       * ('.', '?', '!', etc.) character, possibly followed by a
 211       * hard separator (CR, LF, PS, etc.)
 212       */
 213     UBRK_SENTENCE_TERM       = 0,
 214     /** Upper bound (exclusive) for tags for sentences ended by sentence terminators. */
 215     UBRK_SENTENCE_TERM_LIMIT = 100,
 216     /** Tag value for sentences that do not contain an ending
 217       * sentence terminator ('.', '?', '!', etc.) character, but
 218       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
 219       */
 220     UBRK_SENTENCE_SEP        = 100,
 221     /** Upper bound (exclusive) for tags for sentences ended by a separator.  */
 222     UBRK_SENTENCE_SEP_LIMIT  = 200
 223     /* No tag ranges are defined beyond UBRK_SENTENCE_SEP_LIMIT. */
 224 } USentenceBreakTag;
 225
 226
 227 /**
 228  * Open a new UBreakIterator for locating text boundaries for a specified locale.
 229  * A UBreakIterator may be used for detecting character, line, word,
 230  * and sentence breaks in text.
 231  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
 232  * UBRK_LINE, UBRK_SENTENCE
 233  * @param locale The locale specifying the text-breaking conventions. Note that
 234  * locale keys such as "lb" and "ss" may be used to modify text break behavior,
 235  * see the general discussion of the BreakIterator C API at the top of this file.
 236  * @param text The text to be iterated over. May be null, in which case ubrk_setText() is
 237  *        used to specify the text to be iterated.
 238  * @param textLength The number of characters in text, or -1 if null-terminated.
 239  * @param status A UErrorCode to receive any errors.
 240  * @return A UBreakIterator for the specified locale; close it with ubrk_close() when done.
 241  * @see ubrk_openRules
 242  * @stable ICU 2.0
 243  */
 244 U_CAPI UBreakIterator* U_EXPORT2
 245 ubrk_open(UBreakIteratorType type,
 246       const char *locale,
 247       const UChar *text,
 248       int32_t textLength,
 249       UErrorCode *status);
 250
 251 /**
 252  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
 253  * The rule syntax is described in the ICU User Guide chapter on Boundary Analysis (Break Rules).
 254  * @param rules A set of rules specifying the text breaking conventions.
 255  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 256  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
 257  *        used to specify the text to be iterated.
 258  * @param textLength The number of characters in text, or -1 if null-terminated.
 259  * @param parseErr   Receives position and context information for any syntax errors
 260  *                   detected while parsing the rules.
 261  * @param status A UErrorCode to receive any errors.
 262  * @return A UBreakIterator for the specified rules; close it with ubrk_close() when done.
 263  * @see ubrk_open
 264  * @stable ICU 2.2
 265  */
 266 U_CAPI UBreakIterator* U_EXPORT2
 267 ubrk_openRules(const UChar     *rules,
 268                int32_t         rulesLength,
 269                const UChar     *text,
 270                int32_t          textLength,
 271                UParseError     *parseErr,
 272                UErrorCode      *status);
 273
 274 /**
 275  * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
 276  * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
 277  * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
 278  * compatible across different major versions of ICU, nor across platforms of different
 279  * endianness or different base character set family (ASCII vs EBCDIC).
 280  * @param binaryRules A set of compiled binary rules specifying the text breaking
 281  *                    conventions. Ownership of the storage containing the compiled
 282  *                    rules remains with the caller of this function. The compiled
 283  *                    rules must not be modified or deleted during the life of the
 284  *                    break iterator.
 285  * @param rulesLength The length of binaryRules in bytes; must be >= 0.
 286  * @param text        The text to be iterated over.  May be null, in which case
 287  *                    ubrk_setText() is used to specify the text to be iterated.
 288  * @param textLength  The number of characters in text, or -1 if null-terminated.
 289  * @param status      Pointer to UErrorCode to receive any errors.
 290  * @return            UBreakIterator for the specified rules; close it with ubrk_close() when done.
 291  * @see ubrk_getBinaryRules
 292  * @stable ICU 59
 293  */
 294 U_CAPI UBreakIterator* U_EXPORT2
 295 ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
 296                      const UChar *  text, int32_t textLength,
 297                      UErrorCode *   status);
 298
 299 #ifndef U_HIDE_DEPRECATED_API
 300
 301 /**
 302  * Thread safe cloning operation.
 303  * @param bi iterator to be cloned
 304  * @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
 305  *  User-allocated space for the new clone. If NULL, new memory will be allocated.
 306  *  If the buffer is not large enough, new memory will be allocated.
 307  *  Clients can use U_BRK_SAFECLONE_BUFFERSIZE as the buffer size.
 308  * @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
 309  *  Pointer to the size of the allocated space.
 310  *  If *pBufferSize == 0, a sufficient size for use in cloning will
 311  *  be returned ('pre-flighting').
 312  *  If *pBufferSize is not enough for a stack-based safe clone,
 313  *  new memory will be allocated.
 314  * @param status to indicate whether the operation went on smoothly or there were errors.
 315  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used
 316  *  if pBufferSize != NULL and any allocations were necessary.
 317  * @return pointer to the new clone
 318  * @deprecated ICU 69 Use ubrk_clone() instead.
 319  */
 320 U_DEPRECATED UBreakIterator * U_EXPORT2
 321 ubrk_safeClone(
 322           const UBreakIterator *bi,
 323           void *stackBuffer,
 324           int32_t *pBufferSize,
 325           UErrorCode *status);
 326
 327 #endif /* U_HIDE_DEPRECATED_API */
 328
 329 /**
 330  * Thread safe cloning operation.
 331  * @param bi iterator to be cloned
 332  * @param status to indicate whether the operation went on smoothly or there were errors
 333  * @return pointer to the new clone; close it with ubrk_close() when done
 334  * @stable ICU 69
 335  */
 336 U_CAPI UBreakIterator * U_EXPORT2
 337 ubrk_clone(const UBreakIterator *bi,
 338            UErrorCode *status);
 339
 340 #ifndef U_HIDE_DEPRECATED_API
 341
 342 /**
 343   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
 344   * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
 345   */
 346 #define U_BRK_SAFECLONE_BUFFERSIZE 1
 347
 348 #endif /* U_HIDE_DEPRECATED_API */
 349
 350 /**
 351  * Close a UBreakIterator.
 352  * Once closed, a UBreakIterator may no longer be used.
 353  * @param bi The break iterator to close.
 354  * @stable ICU 2.0
 355  */
 356 U_CAPI void U_EXPORT2
 357 ubrk_close(UBreakIterator *bi);
 358
 359 #if U_SHOW_CPLUSPLUS_API
 360
 361 U_NAMESPACE_BEGIN
 362
 363 /**
 364  * \class LocalUBreakIteratorPointer
 365  * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
 366  * For most methods see the LocalPointerBase base class.
 367  *
 368  * @see LocalPointerBase
 369  * @see LocalPointer
 370  * @stable ICU 4.4
 371  */
 372 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
 373
 374 U_NAMESPACE_END
 375
 376 #endif
 377
 378 /**
 379  * Sets an existing iterator to point to a new piece of text.
 380  * The break iterator retains a pointer to the supplied text.
 381  * The caller must not modify or delete the text while the BreakIterator
 382  * retains the reference.
 383  *
 384  * @param bi The iterator to use
 385  * @param text The text to be set
 386  * @param textLength The length of the text
 387  * @param status The error code
 388  * @stable ICU 2.0
 389  */
 390 U_CAPI void U_EXPORT2
 391 ubrk_setText(UBreakIterator* bi,
 392              const UChar*    text,
 393              int32_t         textLength,
 394              UErrorCode*     status);
 395
 396
 397 /**
 398  * Sets an existing iterator to point to a new piece of text.
 399  *
 400  * All index positions returned by break iterator functions are
 401  * native indices from the UText. For example, when breaking UTF-8
 402  * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
 403  * will be UTF-8 string indices, not UTF-16 positions.
 404  *
 405  * @param bi The iterator to use
 406  * @param text The text to be set.
 407  *             This function makes a shallow clone of the supplied UText.  This means
 408  *             that the caller is free to immediately close or otherwise reuse the
 409  *             UText that was passed as a parameter, but that the underlying text itself
 410  *             must not be altered while being referenced by the break iterator.
 411  * @param status The error code
 412  * @stable ICU 3.4
 413  */
 414 U_CAPI void U_EXPORT2
 415 ubrk_setUText(UBreakIterator* bi,
 416              UText*          text,
 417              UErrorCode*     status);
 418
 419
 420
 421 /**
 422  * Determine the most recently-returned text boundary.
 423  *
 424  * @param bi The break iterator to use.
 425  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
 426  * \ref ubrk_first, or \ref ubrk_last.
 427  * @stable ICU 2.0
 428  */
 429 U_CAPI int32_t U_EXPORT2
 430 ubrk_current(const UBreakIterator *bi);
 431
 432 /**
 433  * Advance the iterator to the boundary following the current boundary.
 434  *
 435  * @param bi The break iterator to use.
 436  * @return The character index of the next text boundary, or UBRK_DONE
 437  * if all text boundaries have been returned.
 438  * @see ubrk_previous
 439  * @stable ICU 2.0
 440  */
 441 U_CAPI int32_t U_EXPORT2
 442 ubrk_next(UBreakIterator *bi);
 443
 444 /**
 445  * Set the iterator position to the boundary preceding the current boundary.
 446  *
 447  * @param bi The break iterator to use.
 448  * @return The character index of the preceding text boundary, or UBRK_DONE
 449  * if all text boundaries have been returned.
 450  * @see ubrk_next
 451  * @stable ICU 2.0
 452  */
 453 U_CAPI int32_t U_EXPORT2
 454 ubrk_previous(UBreakIterator *bi);
 455
 456 /**
 457  * Set the iterator position to zero, the start of the text being scanned.
 458  * @param bi The break iterator to use.
 459  * @return The new iterator position (zero).
 460  * @see ubrk_last
 461  * @stable ICU 2.0
 462  */
 463 U_CAPI int32_t U_EXPORT2
 464 ubrk_first(UBreakIterator *bi);
 465
 466 /**
 467  * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
 468  * This is not the same as the last character.
 469  * @param bi The break iterator to use.
 470  * @return The character offset immediately <EM>beyond</EM> the last character in the
 471  * text being scanned.
 472  * @see ubrk_first
 473  * @stable ICU 2.0
 474  */
 475 U_CAPI int32_t U_EXPORT2
 476 ubrk_last(UBreakIterator *bi);
 477
 478 /**
 479  * Set the iterator position to the first boundary preceding the specified offset.
 480  * The new position is always smaller than offset, or UBRK_DONE.
 481  * @param bi The break iterator to use.
 482  * @param offset The offset to begin scanning.
 483  * @return The text boundary preceding offset, or UBRK_DONE.
 484  * @see ubrk_following
 485  * @stable ICU 2.0
 486  */
 487 U_CAPI int32_t U_EXPORT2
 488 ubrk_preceding(UBreakIterator *bi,
 489            int32_t offset);
 490
 491 /**
 492  * Advance the iterator to the first boundary following the specified offset.
 493  * The value returned is always greater than offset, or UBRK_DONE.
 494  * @param bi The break iterator to use.
 495  * @param offset The offset to begin scanning.
 496  * @return The text boundary following offset, or UBRK_DONE.
 497  * @see ubrk_preceding
 498  * @stable ICU 2.0
 499  */
 500 U_CAPI int32_t U_EXPORT2
 501 ubrk_following(UBreakIterator *bi,
 502            int32_t offset);
 503
 504 /**
 505  * Get a locale for which text breaking information is available.
 506  * A UBreakIterator in a locale returned by this function will perform the correct
 507  * text breaking for the locale.
 508  * @param index The index of the desired locale.
 509  * @return A locale for which text breaking information is available, or 0 if none.
 510  * @see ubrk_countAvailable
 511  * @stable ICU 2.0
 512  */
 513 U_CAPI const char* U_EXPORT2
 514 ubrk_getAvailable(int32_t index);
 515
 516 /**
 517 * Determine how many locales have text breaking information available.
 518 * This function is most useful as determining the loop ending condition for
 519 * calls to \ref ubrk_getAvailable.
 520 * @return The number of locales for which text breaking information is available.
 521 * @see ubrk_getAvailable
 522 * @stable ICU 2.0
 523 */
 524 U_CAPI int32_t U_EXPORT2
 525 ubrk_countAvailable(void);
 526
 527
 528 /**
 529 * Returns true if the specified position is a boundary position.  As a side
 530 * effect, leaves the iterator pointing to the first boundary position at
 531 * or after "offset".
 532 * @param bi The break iterator to use.
 533 * @param offset the offset to check.
 534 * @return True if "offset" is a boundary position.
 535 * @stable ICU 2.0
 536 */
 537 U_CAPI  UBool U_EXPORT2
 538 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
 539
 540 /**
 541  * Return the status from the break rule that determined the most recently
 542  * returned break position.  The values appear in the rule source
 543  * within brackets, {123}, for example.  For rules that do not specify a
 544  * status, a default value of 0 is returned.
 545  * <p>
 546  * For word, line and sentence break iterators, the possible values are defined in the enums UWordBreak, ULineBreakTag and USentenceBreakTag respectively.
 547  * @stable ICU 2.2
 548  */
 549 U_CAPI  int32_t U_EXPORT2
 550 ubrk_getRuleStatus(UBreakIterator *bi);
 551
 552 /**
 553  * Get the statuses from the break rules that determined the most recently
 554  * returned break position.  The values appear in the rule source
 555  * within brackets, {123}, for example.  The default status value for rules
 556  * that do not explicitly provide one is zero.
 557  * <p>
 558  * For word, line and sentence break iterators, the possible values are defined in the enums UWordBreak, ULineBreakTag and USentenceBreakTag respectively.
 559  * @param bi        The break iterator to use
 560  * @param fillInVec an array to be filled in with the status values.
 561  * @param capacity  the length of the supplied vector.  A length of zero causes
 562  *                  the function to return the number of status values, in the
 563  *                  normal way, without attempting to store any values.
 564  * @param status    receives error codes.
 565  * @return          The number of rule status values from rules that determined
 566  *                  the most recent boundary returned by the break iterator.
 567  * @stable ICU 3.0
 568  */
 569 U_CAPI  int32_t U_EXPORT2
 570 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
 571
 572 /**
 573  * Return the locale of the break iterator. You can choose between the valid and
 574  * the actual locale.
 575  * @param bi break iterator
 576  * @param type locale type (valid or actual)
 577  * @param status error code
 578  * @return locale string
 579  * @stable ICU 2.8
 580  */
 581 U_CAPI const char* U_EXPORT2
 582 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
 583
 584 /**
 585   *  Set the subject text string upon which the break iterator is operating
 586   *  without changing any other aspect of the state.
 587   *  The new and previous text strings must have the same content.
 588   *
 589   *  This function is intended for use in environments where ICU is operating on
 590   *  strings that may move around in memory.  It provides a mechanism for notifying
 591   *  ICU that the string has been relocated, and providing a new UText to access the
 592   *  string in its new position.
 593   *
 594   *  Note that the break iterator never copies the underlying text
 595   *  of a string being processed, but always operates directly on the original text
 596   *  provided by the user. Refreshing simply drops the references to the old text
 597   *  and replaces them with references to the new.
 598   *
 599   *  Caution:  this function is normally used only by very specialized
 600   *            system-level code.   One example use case is with garbage collection
 601   *            that moves the text in memory.
 602   *
 603   * @param bi         The break iterator.
 604   * @param text       The new (moved) text string.
 605   * @param status     Receives errors detected by this function.
 606   *
 607   * @stable ICU 49
 608   */
 609 U_CAPI void U_EXPORT2
 610 ubrk_refreshUText(UBreakIterator *bi,
 611                        UText          *text,
 612                        UErrorCode     *status);
 613
 614
 615 /**
 616  * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
 617  * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
 618  * more quickly than using ubrk_openRules. The compiled rules are not compatible across
 619  * different major versions of ICU, nor across platforms of different endianness or
 620  * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
 621  * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
 622  * the binaryRules buffer. However, whether preflighting or not, if the actual length
 623  * is greater than INT32_MAX, then the function returns 0 and sets *status to
 624  * U_INDEX_OUTOFBOUNDS_ERROR.
 625  *
 626  * @param bi            The break iterator to use.
 627  * @param binaryRules   Buffer to receive the compiled binary rules; set to NULL for
 628  *                      preflighting.
 629  * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
 630  *                      preflighting. Must be >= 0.
 631  * @param status        Pointer to UErrorCode to receive any errors, such as
 632  *                      U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or
 633  *                      U_ILLEGAL_ARGUMENT_ERROR.
 634  * @return              The actual byte length of the binary rules, if <= INT32_MAX;
 635  *                      otherwise 0. If not preflighting and this is larger than
 636  *                      rulesCapacity, *status will be set to an error.
 637  * @see ubrk_openBinaryRules
 638  * @stable ICU 59
 639  */
 640 U_CAPI int32_t U_EXPORT2
 641 ubrk_getBinaryRules(UBreakIterator *bi,
 642                     uint8_t *       binaryRules, int32_t rulesCapacity,
 643                     UErrorCode *    status);
 644
 645 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 646
 647 #endif