| Index: icu51/source/common/dictbe.h
|
| ===================================================================
|
| --- icu51/source/common/dictbe.h (revision 0)
|
| +++ icu51/source/common/dictbe.h (revision 0)
|
| @@ -0,0 +1,314 @@
|
| +/**
|
| + *******************************************************************************
|
| + * Copyright (C) 2006,2012, International Business Machines Corporation *
|
| + * and others. All Rights Reserved. *
|
| + *******************************************************************************
|
| + */
|
| +
|
| +#ifndef DICTBE_H
|
| +#define DICTBE_H
|
| +
|
| +#include "unicode/utypes.h"
|
| +#include "unicode/uniset.h"
|
| +#include "unicode/utext.h"
|
| +
|
| +#include "brkeng.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +class DictionaryMatcher;
|
| +
|
| +/*******************************************************************
|
| + * DictionaryBreakEngine
|
| + */
|
| +
|
| +/**
|
| + * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
|
| + * dictionary to determine language-specific breaks.</p>
|
| + *
|
| + * <p>After it is constructed a DictionaryBreakEngine may be shared between
|
| + * threads without synchronization.</p>
|
| + */
|
| +class DictionaryBreakEngine : public LanguageBreakEngine {
|
| + private:
|
| + /**
|
| + * The set of characters handled by this engine
|
| + * @internal
|
| + */
|
| +
|
| + UnicodeSet fSet;
|
| +
|
| + /**
|
| + * The break types handled by this engine (documented as a bitmap on the constructor)
|
| + * @internal
|
| + */
|
| +
|
| + uint32_t fTypes;
|
| +
|
| + /**
|
| + * <p>Default constructor.</p>
|
| + * Declared in the private section of the class.
|
| + */
|
| + DictionaryBreakEngine();
|
| +
|
| + public:
|
| +
|
| + /**
|
| + * <p>Constructor setting the break types handled.</p>
|
| + *
|
| + * @param breakTypes A bitmap of types handled by the engine.
|
| + */
|
| + DictionaryBreakEngine( uint32_t breakTypes );
|
| +
|
| + /**
|
| + * <p>Virtual destructor.</p>
|
| + */
|
| + virtual ~DictionaryBreakEngine();
|
| +
|
| + /**
|
| + * <p>Indicate whether this engine handles a particular character for
|
| + * a particular kind of break.</p>
|
| + *
|
| + * @param c A character which begins a run that the engine might handle
|
| + * @param breakType The type of text break which the caller wants to determine
|
| + * @return TRUE if this engine handles the particular character and break
|
| + * type.
|
| + */
|
| + virtual UBool handles( UChar32 c, int32_t breakType ) const;
|
| +
|
| + /**
|
| + * <p>Find any breaks within a run in the supplied text.</p>
|
| + *
|
| + * @param text A UText representing the text. The iterator is left at
|
| + * the end of the run of characters which the engine is capable of handling
|
| + * that starts from the first (or last) character in the range.
|
| + * @param startPos The start of the run within the supplied text.
|
| + * @param endPos The end of the run within the supplied text.
|
| + * @param reverse Whether the caller is looking for breaks in a reverse
|
| + * direction.
|
| + * @param breakType The type of break desired, or -1.
|
| + * @param foundBreaks A UStack (not a C array) that receives the breaks found, if any
|
| + * @return The number of breaks found.
|
| + */
|
| + virtual int32_t findBreaks( UText *text,
|
| + int32_t startPos,
|
| + int32_t endPos,
|
| + UBool reverse,
|
| + int32_t breakType,
|
| + UStack &foundBreaks ) const;
|
| +
|
| + protected:
|
| +
|
| + /**
|
| + * <p>Set the character set handled by this engine.</p>
|
| + *
|
| + * @param set A UnicodeSet of the set of characters handled by the engine
|
| + */
|
| + virtual void setCharacters( const UnicodeSet &set );
|
| +
|
| + /**
|
| + * <p>Set the break types handled by this engine.</p>
|
| + * NOTE: the declaration below is commented out, so this method is not declared.
|
| + * @param breakTypes A bitmap of types handled by the engine.
|
| + */
|
| +// virtual void setBreakTypes( uint32_t breakTypes );
|
| +
|
| + /**
|
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
| + * Pure virtual: declared here with no implementation in this class.
|
| + * @param text A UText representing the text
|
| + * @param rangeStart The start of the range of dictionary characters
|
| + * @param rangeEnd The end of the range of dictionary characters
|
| + * @param foundBreaks A UStack (not a C array) that receives the break positions found
|
| + * @return The number of breaks found
|
| + */
|
| + virtual int32_t divideUpDictionaryRange( UText *text,
|
| + int32_t rangeStart,
|
| + int32_t rangeEnd,
|
| + UStack &foundBreaks ) const = 0;
|
| +
|
| +};
|
| +
|
| +/*******************************************************************
|
| + * ThaiBreakEngine
|
| + */
|
| +
|
| +/**
|
| + * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
|
| + * dictionary and heuristics to determine Thai-specific breaks.</p>
|
| + *
|
| + * <p>After it is constructed a ThaiBreakEngine may be shared between
|
| + * threads without synchronization.</p>
|
| + */
|
| +class ThaiBreakEngine : public DictionaryBreakEngine {
|
| + private:
|
| + /**
|
| + * The character sets held by this engine, followed by its dictionary matcher
|
| + * @internal
|
| + */
|
| +
|
| + UnicodeSet fThaiWordSet;
|
| + UnicodeSet fEndWordSet;
|
| + UnicodeSet fBeginWordSet;
|
| + UnicodeSet fSuffixSet;
|
| + UnicodeSet fMarkSet;
|
| + DictionaryMatcher *fDictionary;
|
| +
|
| + public:
|
| +
|
| + /**
|
| + * <p>Constructor taking a dictionary to adopt and an error code.</p>
|
| + * @param status A UErrorCode reference (semantics not visible in this header -- TODO confirm)
|
| + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
| + * engine is deleted.
|
| + */
|
| + ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
| +
|
| + /**
|
| + * <p>Virtual destructor.</p>
|
| + */
|
| + virtual ~ThaiBreakEngine();
|
| +
|
| + protected:
|
| + /**
|
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
| + * Declared without "= 0" here, unlike the DictionaryBreakEngine declaration.
|
| + * @param text A UText representing the text
|
| + * @param rangeStart The start of the range of dictionary characters
|
| + * @param rangeEnd The end of the range of dictionary characters
|
| + * @param foundBreaks A UStack (not a C array) that receives the break positions found
|
| + * @return The number of breaks found
|
| + */
|
| + virtual int32_t divideUpDictionaryRange( UText *text,
|
| + int32_t rangeStart,
|
| + int32_t rangeEnd,
|
| + UStack &foundBreaks ) const;
|
| +
|
| +};
|
| +
|
| +#if !UCONFIG_NO_NORMALIZATION
|
| +
|
| +/*******************************************************************
|
| + * CjkBreakEngine
|
| + */
|
| +
|
| +// Indicates the language/script that a CjkBreakEngine will handle.
|
| +enum LanguageType {
|
| + kKorean,
|
| + kChineseJapanese
|
| +};
|
| +
|
| +/**
|
| + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
|
| + * dictionary with costs associated with each word and
|
| + * Viterbi decoding to determine CJK-specific breaks.</p>
|
| + */
|
| +class CjkBreakEngine : public DictionaryBreakEngine {
|
| + protected:
|
| + /**
|
| + * The character sets held by this engine, followed by its dictionary matcher
|
| + * @internal
|
| + */
|
| + UnicodeSet fHangulWordSet;
|
| + UnicodeSet fHanWordSet;
|
| + UnicodeSet fKatakanaWordSet;
|
| + UnicodeSet fHiraganaWordSet;
|
| +
|
| + DictionaryMatcher *fDictionary;
|
| +
|
| + public:
|
| +
|
| + /**
|
| + * <p>Constructor taking a dictionary to adopt, a language type and an error code (status).</p>
|
| + * @param type The language/script this engine will handle (a LanguageType value).
|
| + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
| + * engine is deleted. The DictionaryMatcher must contain costs for each word
|
| + * in order for the dictionary to work properly.
|
| + */
|
| + CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
|
| +
|
| + /**
|
| + * <p>Virtual destructor.</p>
|
| + */
|
| + virtual ~CjkBreakEngine();
|
| +
|
| + protected:
|
| + /**
|
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
| + * Declared without "= 0" here, unlike the DictionaryBreakEngine declaration.
|
| + * @param text A UText representing the text
|
| + * @param rangeStart The start of the range of dictionary characters
|
| + * @param rangeEnd The end of the range of dictionary characters
|
| + * @param foundBreaks A UStack (not a C array) that receives the break positions found
|
| + * @return The number of breaks found
|
| + */
|
| + virtual int32_t divideUpDictionaryRange( UText *text,
|
| + int32_t rangeStart,
|
| + int32_t rangeEnd,
|
| + UStack &foundBreaks ) const;
|
| +
|
| +};
|
| +
|
| +#endif
|
| +
|
| +/*******************************************************************
|
| + * KhmerBreakEngine
|
| + */
|
| +
|
| +/**
|
| + * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
|
| + * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
|
| + *
|
| + * <p>After it is constructed a KhmerBreakEngine may be shared between
|
| + * threads without synchronization.</p>
|
| + */
|
| +class KhmerBreakEngine : public DictionaryBreakEngine {
|
| + private:
|
| + /**
|
| + * The character sets held by this engine, followed by its dictionary matcher
|
| + * @internal
|
| + */
|
| +
|
| + UnicodeSet fKhmerWordSet;
|
| + UnicodeSet fEndWordSet;
|
| + UnicodeSet fBeginWordSet;
|
| + UnicodeSet fMarkSet;
|
| + DictionaryMatcher *fDictionary;
|
| +
|
| + public:
|
| +
|
| + /**
|
| + * <p>Constructor taking a dictionary to adopt and an error code.</p>
|
| + * @param status A UErrorCode reference (semantics not visible in this header -- TODO confirm)
|
| + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
| + * engine is deleted.
|
| + */
|
| + KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
| +
|
| + /**
|
| + * <p>Virtual destructor.</p>
|
| + */
|
| + virtual ~KhmerBreakEngine();
|
| +
|
| + protected:
|
| + /**
|
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
| + * Declared without "= 0" here, unlike the DictionaryBreakEngine declaration.
|
| + * @param text A UText representing the text
|
| + * @param rangeStart The start of the range of dictionary characters
|
| + * @param rangeEnd The end of the range of dictionary characters
|
| + * @param foundBreaks A UStack (not a C array) that receives the break positions found
|
| + * @return The number of breaks found
|
| + */
|
| + virtual int32_t divideUpDictionaryRange( UText *text,
|
| + int32_t rangeStart,
|
| + int32_t rangeEnd,
|
| + UStack &foundBreaks ) const;
|
| +
|
| +};
|
| +
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| + /* DICTBE_H */
|
| +#endif
|
|
|
| Property changes on: icu51/source/common/dictbe.h
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|