ICU 53.1  53.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
normalizer2.h
Go to the documentation of this file.
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __NORMALIZER2_H__
18 #define __NORMALIZER2_H__
19 
25 #include "unicode/utypes.h"
26 
27 #if !UCONFIG_NO_NORMALIZATION
28 
29 #include "unicode/uniset.h"
30 #include "unicode/unistr.h"
31 #include "unicode/unorm2.h"
32 
34 
79 public:
84  ~Normalizer2();
85 
97  static const Normalizer2 *
98  getNFCInstance(UErrorCode &errorCode);
99 
111  static const Normalizer2 *
112  getNFDInstance(UErrorCode &errorCode);
113 
125  static const Normalizer2 *
126  getNFKCInstance(UErrorCode &errorCode);
127 
139  static const Normalizer2 *
140  getNFKDInstance(UErrorCode &errorCode);
141 
153  static const Normalizer2 *
154  getNFKCCasefoldInstance(UErrorCode &errorCode);
155 
177  static const Normalizer2 *
178  getInstance(const char *packageName,
179  const char *name,
180  UNormalization2Mode mode,
181  UErrorCode &errorCode);
182 
// Returns the normalized form of the source string.
// Convenience overload: normalizes src into a local UnicodeString through the
// three-argument normalize() and returns that string by value.
// (Listing line 193, the return type, was lost in extraction and is restored.)
193  UnicodeString
194  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
195  UnicodeString result;
196  normalize(src, result, errorCode);
197  return result;
198  }
212  virtual UnicodeString &
213  normalize(const UnicodeString &src,
214  UnicodeString &dest,
215  UErrorCode &errorCode) const = 0;
230  virtual UnicodeString &
231  normalizeSecondAndAppend(UnicodeString &first,
232  const UnicodeString &second,
233  UErrorCode &errorCode) const = 0;
248  virtual UnicodeString &
249  append(UnicodeString &first,
250  const UnicodeString &second,
251  UErrorCode &errorCode) const = 0;
252 
266  virtual UBool
267  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
268 
293  virtual UBool
294  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
295 
311  virtual UChar32
312  composePair(UChar32 a, UChar32 b) const;
313 
322  virtual uint8_t
323  getCombiningClass(UChar32 c) const;
324 
339  virtual UBool
340  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
341 
// Tests if the string is normalized; pure virtual, result is reported as a
// UNormalizationCheckResult.
// (Listing line 357, the return type, was lost in extraction and is restored.)
357  virtual UNormalizationCheckResult
358  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
359 
382  virtual int32_t
383  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
384 
398  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
399 
414  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
415 
429  virtual UBool isInert(UChar32 c) const = 0;
430 };
431 
444 public:
// Constructs a filtered normalizer wrapping any Normalizer2 instance and a
// filter set. Both arguments are bound to the reference members norm2 and set
// (nothing is copied), so the caller's objects are referenced, not owned.
// NOTE(review): presumably both must outlive this object - confirm.
455  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
456  norm2(n2), set(filterSet) {}
457 
463 
477  virtual UnicodeString &
478  normalize(const UnicodeString &src,
479  UnicodeString &dest,
480  UErrorCode &errorCode) const;
// FilteredNormalizer2 declaration of normalizeSecondAndAppend(): appends the
// normalized form of the second string to the first string and returns the
// first string.
// (Listing line 496, with the function name and first parameter, was lost in
// extraction and is restored to match the three-parameter signature.)
495  virtual UnicodeString &
496  normalizeSecondAndAppend(UnicodeString &first,
497  const UnicodeString &second,
498  UErrorCode &errorCode) const;
513  virtual UnicodeString &
514  append(UnicodeString &first,
515  const UnicodeString &second,
516  UErrorCode &errorCode) const;
517 
529  virtual UBool
530  getDecomposition(UChar32 c, UnicodeString &decomposition) const;
531 
543  virtual UBool
544  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
545 
556  virtual UChar32
557  composePair(UChar32 a, UChar32 b) const;
558 
567  virtual uint8_t
568  getCombiningClass(UChar32 c) const;
569 
581  virtual UBool
582  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
// FilteredNormalizer2 declaration of quickCheck(): tests if the string is
// normalized and reports a UNormalizationCheckResult.
// (Listing line 594, the return type, was lost in extraction and is restored.)
594  virtual UNormalizationCheckResult
595  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
607  virtual int32_t
608  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
609 
618  virtual UBool hasBoundaryBefore(UChar32 c) const;
619 
628  virtual UBool hasBoundaryAfter(UChar32 c) const;
629 
637  virtual UBool isInert(UChar32 c) const;
638 private:
639  UnicodeString &
640  normalize(const UnicodeString &src,
641  UnicodeString &dest,
642  USetSpanCondition spanCondition,
643  UErrorCode &errorCode) const;
644 
// Private helper taking an extra doNormalize flag.
// NOTE(review): presumably the shared implementation behind the public
// normalizeSecondAndAppend() and append() - confirm against normalizer2.cpp.
// (Listing line 646, with the function name and first parameter, was lost in
// extraction; the name is restored from the parallel public signature.)
645  UnicodeString &
646  normalizeSecondAndAppend(UnicodeString &first,
647  const UnicodeString &second,
648  UBool doNormalize,
649  UErrorCode &errorCode) const;
650 
651  const Normalizer2 &norm2;
652  const UnicodeSet &set;
653 };
654 
656 
657 #endif // !UCONFIG_NO_NORMALIZATION
658 #endif // __NORMALIZER2_H__
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:194
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
C++ API: Unicode String.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:78
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:298
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:455
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition: uset.h:150
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:42
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers, and to use the same mechanism for C and C++.
Definition: utypes.h:476
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:357
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer classes.
Definition: unistr.h:245
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:221
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:443
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:91
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200
C++ API: Unicode Set.