| Index: icu51/source/common/ucnv2022.cpp
|
| ===================================================================
|
| --- icu51/source/common/ucnv2022.cpp (revision 0)
|
| +++ icu51/source/common/ucnv2022.cpp (revision 0)
|
| @@ -0,0 +1,3951 @@
|
| +/*
|
| +**********************************************************************
|
| +* Copyright (C) 2000-2012, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +**********************************************************************
|
| +* file name: ucnv2022.cpp
|
| +* encoding: US-ASCII
|
| +* tab size: 8 (not used)
|
| +* indentation:4
|
| +*
|
| +* created on: 2000feb03
|
| +* created by: Markus W. Scherer
|
| +*
|
| +* Change history:
|
| +*
|
| +* 06/29/2000 helena Major rewrite of the callback APIs.
|
| +* 08/08/2000 Ram Included support for ISO-2022-JP-2
|
| +* Changed implementation of toUnicode
|
| +* function
|
| +* 08/21/2000 Ram Added support for ISO-2022-KR
|
| +* 08/29/2000 Ram Separated implementation of EBCDIC to
|
| +* ucnvebdc.c
|
| +* 09/20/2000 Ram Added support for ISO-2022-CN
|
| +* Added implementations for getNextUChar()
|
| +* for specific 2022 country variants.
|
| +* 10/31/2000 Ram Implemented offsets logic functions
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
|
| +
|
| +#include "unicode/ucnv.h"
|
| +#include "unicode/uset.h"
|
| +#include "unicode/ucnv_err.h"
|
| +#include "unicode/ucnv_cb.h"
|
| +#include "unicode/utf16.h"
|
| +#include "ucnv_imp.h"
|
| +#include "ucnv_bld.h"
|
| +#include "ucnv_cnv.h"
|
| +#include "ucnvmbcs.h"
|
| +#include "cstring.h"
|
| +#include "cmemory.h"
|
| +#include "uassert.h"
|
| +
|
| +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| +/*
|
| + * I am disabling the generic ISO-2022 converter after proposing to do so on
|
| + * the icu mailing list two days ago.
|
| + *
|
| + * Reasons:
|
| + * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
|
| + * its designation sequences, single shifts with return to the previous state,
|
| + * switch-with-no-return to UTF-16BE or similar, etc.
|
| + * This is unlike the language-specific variants like ISO-2022-JP which
|
| + * require a much smaller repertoire of ISO-2022 features.
|
| + * These variants continue to be supported.
|
| + * 2. I believe that no one is really using the generic ISO-2022 converter
|
| + * but rather always one of the language-specific variants.
|
| + * Note that ICU's generic ISO-2022 converter has always output one escape
|
| + * sequence followed by UTF-8 for the whole stream.
|
| + * 3. Switching between subcharsets is extremely slow, because each time
|
| + * the previous converter is closed and a new one opened,
|
| + * without any kind of caching, least-recently-used list, etc.
|
| + * 4. The code is currently buggy, and given the above it does not seem
|
| + * reasonable to spend the time on maintenance.
|
| + * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
|
| + * This means, for example, that when ISO-8859-7 is designated, the following
|
| + * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
|
| + * The ICU ISO-2022 converter does not handle this - and has no information
|
| + * about which subconverter would have to be shifted vs. which is designed
|
| + * for 7-bit ISO-2022.
|
| + *
|
| + * Markus Scherer 2003-dec-03
|
| + */
|
| +#endif
|
| +
|
| +static const char SHIFT_IN_STR[] = "\x0F";
|
| +// static const char SHIFT_OUT_STR[] = "\x0E";
|
| +
|
| +#define CR 0x0D
|
| +#define LF 0x0A
|
| +#define H_TAB 0x09
|
| +#define V_TAB 0x0B
|
| +#define SPACE 0x20
|
| +
|
| +enum {
|
| + HWKANA_START=0xff61,
|
| + HWKANA_END=0xff9f
|
| +};
|
| +
|
| +/*
|
| + * 94-character sets with native byte values A1..FE are encoded in ISO 2022
|
| + * as bytes 21..7E. (Subtract 0x80.)
|
| + * 96-character sets with native byte values A0..FF are encoded in ISO 2022
|
| + * as bytes 20..7F. (Subtract 0x80.)
|
| + * Do not encode C1 control codes with native bytes 80..9F
|
| + * as bytes 00..1F (C0 control codes).
|
| + */
|
| +enum {
|
| + GR94_START=0xa1,
|
| + GR94_END=0xfe,
|
| + GR96_START=0xa0,
|
| + GR96_END=0xff
|
| +};
|
| +
|
| +/*
|
| + * ISO 2022 control codes must not be converted from Unicode
|
| + * because they would mess up the byte stream.
|
| + * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
|
| + * corresponding to SO, SI, and ESC.
|
| + */
|
| +#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
|
| +
|
| +/* for ISO-2022-JP and -CN implementations */
|
| +typedef enum {
|
| + /* shared values */
|
| + INVALID_STATE=-1,
|
| + ASCII = 0,
|
| +
|
| + SS2_STATE=0x10,
|
| + SS3_STATE,
|
| +
|
| + /* JP */
|
| + ISO8859_1 = 1 ,
|
| + ISO8859_7 = 2 ,
|
| + JISX201 = 3,
|
| + JISX208 = 4,
|
| + JISX212 = 5,
|
| + GB2312 =6,
|
| + KSC5601 =7,
|
| + HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
|
| +
|
| + /* CN */
|
| + /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
|
| + GB2312_1=1,
|
| + ISO_IR_165=2,
|
| + CNS_11643=3,
|
| +
|
| + /*
|
| + * these are used in StateEnum and ISO2022State variables,
|
| + * but CNS_11643 must be used to index into myConverterArray[]
|
| + */
|
| + CNS_11643_0=0x20,
|
| + CNS_11643_1,
|
| + CNS_11643_2,
|
| + CNS_11643_3,
|
| + CNS_11643_4,
|
| + CNS_11643_5,
|
| + CNS_11643_6,
|
| + CNS_11643_7
|
| +} StateEnum;
|
| +
|
| +/* is the StateEnum charset value for a DBCS charset? */
|
| +#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
|
| +
|
| +#define CSM(cs) ((uint16_t)1<<(cs))
|
| +
|
| +/*
|
| + * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
|
| + * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
|
| + *
|
| + * Note: The converter uses some leniency:
|
| + * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
|
| + * all versions, not just JIS7 and JIS8.
|
| + * - ICU does not distinguish between different versions of JIS X 0208.
|
| + */
|
| +enum { MAX_JA_VERSION=4 };
|
| +static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
|
| + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
|
| + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
|
| + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
|
| + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
|
| + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
|
| +};
|
| +
|
| +typedef enum {
|
| + ASCII1=0,
|
| + LATIN1,
|
| + SBCS,
|
| + DBCS,
|
| + MBCS,
|
| + HWKANA
|
| +}Cnv2022Type;
|
| +
|
| +typedef struct ISO2022State {
|
| + int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
|
| + int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
|
| + int8_t prevG; /* g before single shift (SS2 or SS3) */
|
| +} ISO2022State;
|
| +
|
| +#define UCNV_OPTIONS_VERSION_MASK 0xf
|
| +#define UCNV_2022_MAX_CONVERTERS 10
|
| +
|
| +typedef struct{
|
| + UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
|
| + UConverter *currentConverter;
|
| + Cnv2022Type currentType;
|
| + ISO2022State toU2022State, fromU2022State;
|
| + uint32_t key;
|
| + uint32_t version;
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + UBool isFirstBuffer;
|
| +#endif
|
| + UBool isEmptySegment;
|
| + char name[30];
|
| + char locale[3];
|
| +}UConverterDataISO2022;
|
| +
|
| +/* Protos */
|
| +/* ISO-2022 ----------------------------------------------------------------- */
|
| +
|
| +/*Forward declaration */
|
| +U_CFUNC void
|
| +ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
|
| + UErrorCode * err);
|
| +U_CFUNC void
|
| +ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
|
| + UErrorCode * err);
|
| +
|
| +#define ESC_2022 0x1B /*ESC*/
|
| +
|
| +typedef enum
|
| +{
|
| + INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
|
| + VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
|
| + VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
|
| + VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
|
| +} UCNV_TableStates_2022;
|
| +
|
| +/*
|
| +* The way these state transition arrays work is:
|
| +* ex : ESC$B is the sequence for JISX208
|
| +* a) First Iteration: char is ESC
|
| +* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
|
| +* int x = normalize_esq_chars_2022[27] which is equal to 1
|
| +* ii) Search for this value in escSeqStateTable_Key_2022[]
|
| +* value of x is stored at escSeqStateTable_Key_2022[0]
|
| +* iii) Save this index as offset
|
| +* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
|
| +* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
|
| +* b) Switch on this state and continue to next char
|
| +* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
|
| +* which is normalize_esq_chars_2022[36] == 4
|
| +* ii) x is currently 1(from above)
|
| +* x<<=5 -- x is now 32
|
| +* x+=normalize_esq_chars_2022[36]
|
| +* now x is 36
|
| +* iii) Search for this value in escSeqStateTable_Key_2022[]
|
| +* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
|
| +* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
|
| +* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
|
| +* c) Switch on this state and continue to next char
|
| +* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
|
| +* ii) x is currently 36 (from above)
|
| +* x<<=5 -- x is now 1152
|
| +* x+=normalize_esq_chars_2022[66]
|
| +* now x is 1161
|
| +* iii) Search for this value in escSeqStateTable_Key_2022[]
|
| +* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
|
| +* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
|
| +* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
|
| +* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
|
| +*/
|
| +
|
| +
|
| +/*Below are the 3 arrays depicting a state transition table*/
|
| +static const int8_t normalize_esq_chars_2022[256] = {
|
| +/* 0 1 2 3 4 5 6 7 8 9 */
|
| +
|
| + 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
|
| + ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
|
| + ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
|
| + ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
|
| + ,0 ,0 ,0 ,0 ,0 ,0
|
| +};
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| +/*
|
| + * When the generic ISO-2022 converter is completely removed, not just disabled
|
| + * per #ifdef, then the following state table and the associated tables that are
|
| + * dimensioned with MAX_STATES_2022 should be trimmed.
|
| + *
|
| + * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
|
| + * the associated escape sequences starting with ESC ( B should be removed.
|
| + * This includes the ones with key values 1097 and all of the ones above 1000000.
|
| + *
|
| + * For the latter, the tables can simply be truncated.
|
| + * For the former, since the tables must be kept parallel, it is probably best
|
| + * to simply duplicate an adjacent table cell, parallel in all tables.
|
| + *
|
| + * It may make sense to restructure the tables, especially by using small search
|
| + * tables for the variants instead of indexing them parallel to the table here.
|
| + */
|
| +#endif
|
| +
|
| +#define MAX_STATES_2022 74
|
| +static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
|
| +/* 0 1 2 3 4 5 6 7 8 9 */
|
| +
|
| + 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
|
| + ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
|
| + ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
|
| + ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
|
| + ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
|
| + ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
|
| + ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
|
| + ,35947631 ,35947635 ,35947636 ,35947638
|
| +};
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| +
|
| +static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
|
| + /* 0 1 2 3 4 5 6 7 8 9 */
|
| +
|
| + NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
|
| + ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
|
| + ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
|
| + ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
|
| + ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
|
| + ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
|
| + ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
|
| + ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
|
| +};
|
| +
|
| +#endif
|
| +
|
| +static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
|
| +/* 0 1 2 3 4 5 6 7 8 9 */
|
| + VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
| +};
|
| +
|
| +
|
| +/* Type def for refactoring changeState_2022 code*/
|
| +typedef enum{
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + ISO_2022=0,
|
| +#endif
|
| + ISO_2022_JP=1,
|
| + ISO_2022_KR=2,
|
| + ISO_2022_CN=3
|
| +} Variant2022;
|
| +
|
| +/*********** ISO 2022 Converter Protos ***********/
|
| +static void
|
| +_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
|
| +
|
| +static void
|
| + _ISO2022Close(UConverter *converter);
|
| +
|
| +static void
|
| +_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
|
| +
|
| +static const char*
|
| +_ISO2022getName(const UConverter* cnv);
|
| +
|
| +static void
|
| +_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
|
| +
|
| +static UConverter *
|
| +_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| +static void
|
| +T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
|
| +#endif
|
| +
|
| +namespace {
|
| +
|
| +/*const UConverterSharedData _ISO2022Data;*/
|
| +extern const UConverterSharedData _ISO2022JPData;
|
| +extern const UConverterSharedData _ISO2022KRData;
|
| +extern const UConverterSharedData _ISO2022CNData;
|
| +
|
| +} // namespace
|
| +
|
| +/*************** Converter implementations ******************/
|
| +
|
| +/* Adapter that lets callers holding a uint8_t* target use ucnv_fromUWriteBytes(), which is passed a char** target here; it exists to get around gcc compiler warnings. */
|
| +static inline void
|
| +fromUWriteUInt8(UConverter *cnv,
|
| + const char *bytes, int32_t length,
|
| + uint8_t **target, const char *targetLimit,
|
| + int32_t **offsets,
|
| + int32_t sourceIndex,
|
| + UErrorCode *pErrorCode)
|
| +{
|
| + char *targetChars = (char *)*target; /* char* view of the caller's current target position */
|
| + ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
|
| + offsets, sourceIndex, pErrorCode);
|
| + *target = (uint8_t*)targetChars; /* hand the (possibly updated) position back to the caller */
|
| +
|
| +}
|
| +/* For version 1 only: zeroes the to-Unicode state fields of myConverterData->currentConverter; other versions are left untouched. The first parameter is unused. */
|
| +static inline void
|
| +setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
|
| + if(myConverterData->version == 1) {
|
| + UConverter *cnv = myConverterData->currentConverter;
|
| +
|
| + cnv->toUnicodeStatus=0; /* offset */
|
| + cnv->mode=0; /* state */
|
| + cnv->toULength=0; /* byteIndex */
|
| + }
|
| +}
|
| +/* Sets up from-Unicode state for ISO-2022-KR: stores the 4-byte designator ESC $ ) C in converter->charErrorBuffer when that buffer is empty and, for version 1, resets two fields of currentConverter. */
|
| +static inline void
|
| +setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
|
| + /* in ISO-2022-KR the designator sequence appears only once
|
| + * in a file so we append it only once
|
| + */
|
| + if( converter->charErrorBufferLength==0){ /* skipped when charErrorBuffer already holds bytes */
|
| +
|
| + converter->charErrorBufferLength = 4;
|
| + converter->charErrorBuffer[0] = 0x1b; /* ESC */
|
| + converter->charErrorBuffer[1] = 0x24; /* '$' */
|
| + converter->charErrorBuffer[2] = 0x29; /* ')' */
|
| + converter->charErrorBuffer[3] = 0x43; /* 'C' */
|
| + }
|
| + if(myConverterData->version == 1) {
|
| + UConverter *cnv = myConverterData->currentConverter;
|
| +
|
| + cnv->fromUChar32=0;
|
| + cnv->fromUnicodeStatus=1; /* prevLength */
|
| + }
|
| +}
|
| +
|
| +static void
|
| +_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
|
| +
|
| + char myLocale[6]={' ',' ',' ',' ',' ',' '};
|
| +
|
| + cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
|
| + if(cnv->extraInfo != NULL) {
|
| + UConverterNamePieces stackPieces;
|
| + UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
|
| + UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
|
| + uint32_t version;
|
| +
|
| + stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
|
| +
|
| + uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
|
| + myConverterData->currentType = ASCII1;
|
| + cnv->fromUnicodeStatus =FALSE;
|
| + if(pArgs->locale){
|
| + uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
|
| + }
|
| + version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
|
| + myConverterData->version = version;
|
| + if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
|
| + (myLocale[2]=='_' || myLocale[2]=='\0'))
|
| + {
|
| + size_t len=0;
|
| + /* open the required converters and cache them */
|
| + if(version>MAX_JA_VERSION) {
|
| + /* prevent indexing beyond jpCharsetMasks[] */
|
| + myConverterData->version = version = 0;
|
| + }
|
| + if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
|
| + myConverterData->myConverterArray[ISO8859_7] =
|
| + ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
|
| + }
|
| + myConverterData->myConverterArray[JISX208] =
|
| + ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
|
| + if(jpCharsetMasks[version]&CSM(JISX212)) {
|
| + myConverterData->myConverterArray[JISX212] =
|
| + ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
|
| + }
|
| + if(jpCharsetMasks[version]&CSM(GB2312)) {
|
| + myConverterData->myConverterArray[GB2312] =
|
| + ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
|
| + }
|
| + if(jpCharsetMasks[version]&CSM(KSC5601)) {
|
| + myConverterData->myConverterArray[KSC5601] =
|
| + ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
|
| + }
|
| +
|
| + /* set the function pointers to appropriate functions */
|
| + cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
|
| + uprv_strcpy(myConverterData->locale,"ja");
|
| +
|
| + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
|
| + len = uprv_strlen(myConverterData->name);
|
| + myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
|
| + myConverterData->name[len+1]='\0';
|
| + }
|
| + else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
|
| + (myLocale[2]=='_' || myLocale[2]=='\0'))
|
| + {
|
| + const char *cnvName;
|
| + if(version==1) {
|
| + cnvName="icu-internal-25546";
|
| + } else {
|
| + cnvName="ibm-949";
|
| + myConverterData->version=version=0;
|
| + }
|
| + if(pArgs->onlyTestIsLoadable) {
|
| + ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
|
| + uprv_free(cnv->extraInfo);
|
| + cnv->extraInfo=NULL;
|
| + return;
|
| + } else {
|
| + myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
|
| + if (U_FAILURE(*errorCode)) {
|
| + _ISO2022Close(cnv);
|
| + return;
|
| + }
|
| +
|
| + if(version==1) {
|
| + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
|
| + uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
|
| + cnv->subCharLen = myConverterData->currentConverter->subCharLen;
|
| + }else{
|
| + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
|
| + }
|
| +
|
| + /* initialize the state variables */
|
| + setInitialStateToUnicodeKR(cnv, myConverterData);
|
| + setInitialStateFromUnicodeKR(cnv, myConverterData);
|
| +
|
| + /* set the function pointers to appropriate functions */
|
| + cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
|
| + uprv_strcpy(myConverterData->locale,"ko");
|
| + }
|
| + }
|
| + else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
|
| + (myLocale[2]=='_' || myLocale[2]=='\0'))
|
| + {
|
| +
|
| + /* open the required converters and cache them */
|
| + myConverterData->myConverterArray[GB2312_1] =
|
| + ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
|
| + if(version==1) {
|
| + myConverterData->myConverterArray[ISO_IR_165] =
|
| + ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
|
| + }
|
| + myConverterData->myConverterArray[CNS_11643] =
|
| + ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
|
| +
|
| +
|
| + /* set the function pointers to appropriate functions */
|
| + cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
|
| + uprv_strcpy(myConverterData->locale,"cn");
|
| +
|
| + if (version==0){
|
| + myConverterData->version = 0;
|
| + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
|
| + }else if (version==1){
|
| + myConverterData->version = 1;
|
| + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
|
| + }else {
|
| + myConverterData->version = 2;
|
| + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
|
| + }
|
| + }
|
| + else{
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + myConverterData->isFirstBuffer = TRUE;
|
| +
|
| + /* append the UTF-8 escape sequence */
|
| + cnv->charErrorBufferLength = 3;
|
| + cnv->charErrorBuffer[0] = 0x1b;
|
| + cnv->charErrorBuffer[1] = 0x25;
|
| + cnv->charErrorBuffer[2] = 0x42;
|
| +
|
| + cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
|
| + /* initialize the state variables */
|
| + uprv_strcpy(myConverterData->name,"ISO_2022");
|
| +#else
|
| + *errorCode = U_UNSUPPORTED_ERROR;
|
| + return;
|
| +#endif
|
| + }
|
| +
|
| + cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
|
| +
|
| + if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
|
| + _ISO2022Close(cnv);
|
| + }
|
| + } else {
|
| + *errorCode = U_MEMORY_ALLOCATION_ERROR;
|
| + }
|
| +}
|
| +
|
| +/* Releases the ISO-2022 extra data of a converter: unloads every cached shared-data slot, closes currentConverter, and frees extraInfo unless isExtraLocal is set. A NULL extraInfo is a no-op. */
|
| +static void
|
| +_ISO2022Close(UConverter *converter) {
|
| +    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
|
| +    UConverterSharedData **array;
|
| +    int32_t i;
|
| +
|
| +    if (myData != NULL) {
|
| +        array = myData->myConverterArray; /* member access only after the NULL check: extraInfo may already be freed and NULL */
|
| +        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { /* close the array of converter pointers */
|
| +            if(array[i]!=NULL) {
|
| +                ucnv_unloadSharedDataIfReady(array[i]);
|
| +            }
|
| +        }
|
| +
|
| +        ucnv_close(myData->currentConverter);
|
| +
|
| +        if(!converter->isExtraLocal){ /* extraInfo is only freed here when it is not flagged as local */
|
| +            uprv_free (converter->extraInfo);
|
| +            converter->extraInfo = NULL;
|
| +        }
|
| +    }
|
| +}
|
| +/* Resets ISO-2022 state per choice: the to-Unicode side when choice<=UCNV_RESET_TO_UNICODE, the from-Unicode side when choice!=UCNV_RESET_TO_UNICODE; a locale starting with 'k' also re-initializes its KR state. */
|
| +static void
|
| +_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
|
| +    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
|
| +    if(choice<=UCNV_RESET_TO_UNICODE) {
|
| +        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
|
| +        myConverterData->key = 0; /* forget any partially matched escape sequence */
|
| +        myConverterData->isEmptySegment = FALSE;
|
| +    }
|
| +    if(choice!=UCNV_RESET_TO_UNICODE) {
|
| +        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
|
| +    }
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| +    if(myConverterData->locale[0] == 0){ /* empty locale: the generic ISO-2022 converter */
|
| +        if(choice<=UCNV_RESET_TO_UNICODE) {
|
| +            myConverterData->isFirstBuffer = TRUE;
|
| +            myConverterData->key = 0;
|
| +            if (converter->mode == UCNV_SO){
|
| +                ucnv_close (myConverterData->currentConverter);
|
| +                myConverterData->currentConverter=NULL;
|
| +            }
|
| +            converter->mode = UCNV_SI;
|
| +        }
|
| +        if(choice!=UCNV_RESET_TO_UNICODE) {
|
| +            /* re-append UTF-8 escape sequence ESC % B, the same bytes _ISO2022Open() stores */
|
| +            converter->charErrorBufferLength = 3;
|
| +            converter->charErrorBuffer[0] = 0x1b; /* ESC */
|
| +            converter->charErrorBuffer[1] = 0x25; /* '%' -- was 0x28 '(' which made ESC ( B, mapped to "latin1" rather than "UTF8" by the key table */
|
| +            converter->charErrorBuffer[2] = 0x42; /* 'B' */
|
| +        }
|
| +    }
|
| +    else
|
| +#endif
|
| +    {
|
| +        /* reset the state variables */
|
| +        if(myConverterData->locale[0] == 'k'){
|
| +            if(choice<=UCNV_RESET_TO_UNICODE) {
|
| +                setInitialStateToUnicodeKR(converter, myConverterData);
|
| +            }
|
| +            if(choice!=UCNV_RESET_TO_UNICODE) {
|
| +                setInitialStateFromUnicodeKR(converter, myConverterData);
|
| +            }
|
| +        }
|
| +    }
|
| +}
|
| +/* Returns the name string stored in the converter's ISO-2022 extra data, or NULL when the converter has no extra data. */
|
| +static const char*
|
| +_ISO2022getName(const UConverter* cnv){
|
| +    const UConverterDataISO2022 *data=(const UConverterDataISO2022 *)cnv->extraInfo;
|
| +    if(data==NULL) {
|
| +        return NULL;
|
| +    }
|
| +    return data->name;
|
| +}
|
| +
|
| +
|
| +/*************** to unicode *******************/
|
| +/****************************************************************************
|
| + * Recognized escape sequences are
|
| + * <ESC>(B ASCII
|
| + * <ESC>.A ISO-8859-1
|
| + * <ESC>.F ISO-8859-7
|
| + * <ESC>(J JISX-201
|
| + * <ESC>(I JISX-201
|
| + * <ESC>$B JISX-208
|
| + * <ESC>$@ JISX-208
|
| + * <ESC>$(D JISX-212
|
| + * <ESC>$A GB2312
|
| + * <ESC>$(C KSC5601
|
| + */
|
| +static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
|
| +/* 0 1 2 3 4 5 6 7 8 9 */
|
| + INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| +};
|
| +
|
| +/*************** to unicode *******************/
|
| +static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
|
| +/* 0 1 2 3 4 5 6 7 8 9 */
|
| + INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
|
| + ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| + ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
| +};
|
| +
|
| +/* Extends the escape-sequence key *key by byte c and looks the result up in escSeqStateTable_Key_2022[]. On a hit, stores the new key and the table index in *key / *offset and returns that entry's state; otherwise zeroes both and returns INVALID_2022. */
|
| +static UCNV_TableStates_2022
|
| +getKey_2022(char c,int32_t* key,int32_t* offset){
|
| +    int32_t togo;
|
| +    int32_t low = 0;
|
| +    int32_t hi = MAX_STATES_2022;
|
| +    int32_t oldmid=0;
|
| +
|
| +    togo = normalize_esq_chars_2022[(uint8_t)c];
|
| +    if(togo == 0) {
|
| +        /* not a valid character anywhere in an escape sequence */
|
| +        *key = 0;
|
| +        *offset = 0;
|
| +        return INVALID_2022;
|
| +    }
|
| +    togo = (*key << 5) + togo; /* previous key shifted by 5 bits plus the normalized code of c */
|
| +
|
| +    while (hi != low) { /* binary search over the ascending key table */
|
| +        /* plain local: the 'register' storage class was removed in C++17 */
|
| +        int32_t mid = (hi+low) >> 1; /* midpoint of the current interval */
|
| +
|
| +        if (mid == oldmid) {
|
| +            break; /* interval stopped shrinking: the key is not in the table */
|
| +        }
|
| +        if (escSeqStateTable_Key_2022[mid] > togo){
|
| +            hi = mid;
|
| +        }
|
| +        else if (escSeqStateTable_Key_2022[mid] < togo){
|
| +            low = mid;
|
| +        }
|
| +        else /*we found it*/{
|
| +            *key = togo;
|
| +            *offset = mid;
|
| +            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
|
| +        }
|
| +        oldmid = mid;
|
| +
|
| +    }
|
| +
|
| +    *key = 0;
|
| +    *offset = 0;
|
| +    return INVALID_2022;
|
| +}
|
| +
|
| +/*runs through a state machine to determine the escape sequence - codepage correspondence
|
| + */
|
| +static void
|
| +changeState_2022(UConverter* _this,
|
| + const char** source,
|
| + const char* sourceLimit,
|
| + Variant2022 var,
|
| + UErrorCode* err){
|
| + UCNV_TableStates_2022 value;
|
| + UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
|
| + uint32_t key = myData2022->key;
|
| + int32_t offset = 0;
|
| + int8_t initialToULength = _this->toULength;
|
| + char c;
|
| +
|
| + value = VALID_NON_TERMINAL_2022;
|
| + while (*source < sourceLimit) {
|
| + c = *(*source)++;
|
| + _this->toUBytes[_this->toULength++]=(uint8_t)c;
|
| + value = getKey_2022(c,(int32_t *) &key, &offset);
|
| +
|
| + switch (value){
|
| +
|
| + case VALID_NON_TERMINAL_2022 :
|
| + /* continue with the loop */
|
| + break;
|
| +
|
| + case VALID_TERMINAL_2022:
|
| + key = 0;
|
| + goto DONE;
|
| +
|
| + case INVALID_2022:
|
| + goto DONE;
|
| +
|
| + case VALID_MAYBE_TERMINAL_2022:
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + /* ESC ( B is ambiguous only for ISO_2022 itself */
|
| + if(var == ISO_2022) {
|
| + /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
|
| + _this->toULength = 0;
|
| +
|
| + /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
|
| +
|
| + /* continue with the loop */
|
| + value = VALID_NON_TERMINAL_2022;
|
| + break;
|
| + } else
|
| +#endif
|
| + {
|
| + /* not ISO_2022 itself, finish here */
|
| + value = VALID_TERMINAL_2022;
|
| + key = 0;
|
| + goto DONE;
|
| + }
|
| + }
|
| + }
|
| +
|
| +DONE:
|
| + myData2022->key = key;
|
| +
|
| + if (value == VALID_NON_TERMINAL_2022) {
|
| + /* indicate that the escape sequence is incomplete: key!=0 */
|
| + return;
|
| + } else if (value == INVALID_2022 ) {
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + } else /* value == VALID_TERMINAL_2022 */ {
|
| + switch(var){
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + case ISO_2022:
|
| + {
|
| + const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
|
| + if(chosenConverterName == NULL) {
|
| + /* SS2 or SS3 */
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + _this->toUCallbackReason = UCNV_UNASSIGNED;
|
| + return;
|
| + }
|
| +
|
| + _this->mode = UCNV_SI;
|
| + ucnv_close(myData2022->currentConverter);
|
| + myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
|
| + if(U_SUCCESS(*err)) {
|
| + myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
|
| + _this->mode = UCNV_SO;
|
| + }
|
| + break;
|
| + }
|
| +#endif
|
| + case ISO_2022_JP:
|
| + {
|
| + StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
|
| + switch(tempState) {
|
| + case INVALID_STATE:
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + break;
|
| + case SS2_STATE:
|
| + if(myData2022->toU2022State.cs[2]!=0) {
|
| + if(myData2022->toU2022State.g<2) {
|
| + myData2022->toU2022State.prevG=myData2022->toU2022State.g;
|
| + }
|
| + myData2022->toU2022State.g=2;
|
| + } else {
|
| + /* illegal to have SS2 before a matching designator */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + }
|
| + break;
|
| + /* case SS3_STATE: not used in ISO-2022-JP-x */
|
| + case ISO8859_1:
|
| + case ISO8859_7:
|
| + if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + } else {
|
| + /* G2 charset for SS2 */
|
| + myData2022->toU2022State.cs[2]=(int8_t)tempState;
|
| + }
|
| + break;
|
| + default:
|
| + if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + } else {
|
| + /* G0 charset */
|
| + myData2022->toU2022State.cs[0]=(int8_t)tempState;
|
| + }
|
| + break;
|
| + }
|
| + }
|
| + break;
|
| + case ISO_2022_CN:
|
| + {
|
| + StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
|
| + switch(tempState) {
|
| + case INVALID_STATE:
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + break;
|
| + case SS2_STATE:
|
| + if(myData2022->toU2022State.cs[2]!=0) {
|
| + if(myData2022->toU2022State.g<2) {
|
| + myData2022->toU2022State.prevG=myData2022->toU2022State.g;
|
| + }
|
| + myData2022->toU2022State.g=2;
|
| + } else {
|
| + /* illegal to have SS2 before a matching designator */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + }
|
| + break;
|
| + case SS3_STATE:
|
| + if(myData2022->toU2022State.cs[3]!=0) {
|
| + if(myData2022->toU2022State.g<2) {
|
| + myData2022->toU2022State.prevG=myData2022->toU2022State.g;
|
| + }
|
| + myData2022->toU2022State.g=3;
|
| + } else {
|
| + /* illegal to have SS3 before a matching designator */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + }
|
| + break;
|
| + case ISO_IR_165:
|
| + if(myData2022->version==0) {
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + break;
|
| + }
|
| + /*fall through*/
|
| + case GB2312_1:
|
| + /*fall through*/
|
| + case CNS_11643_1:
|
| + myData2022->toU2022State.cs[1]=(int8_t)tempState;
|
| + break;
|
| + case CNS_11643_2:
|
| + myData2022->toU2022State.cs[2]=(int8_t)tempState;
|
| + break;
|
| + default:
|
| + /* other CNS 11643 planes */
|
| + if(myData2022->version==0) {
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + } else {
|
| + myData2022->toU2022State.cs[3]=(int8_t)tempState;
|
| + }
|
| + break;
|
| + }
|
| + }
|
| + break;
|
| + case ISO_2022_KR:
|
| + if(offset==0x30){
|
| + /* nothing to be done, just accept this one escape sequence */
|
| + } else {
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + }
|
| + break;
|
| +
|
| + default:
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + break;
|
| + }
|
| + }
|
| + if(U_SUCCESS(*err)) {
|
| + _this->toULength = 0;
|
| + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
|
| + if(_this->toULength>1) {
|
| + /*
|
| + * Ticket 5691: consistent illegal sequences:
|
| + * - We include at least the first byte (ESC) in the illegal sequence.
|
| + * - If any of the non-initial bytes could be the start of a character,
|
| + * we stop the illegal sequence before the first one of those.
|
| + * In escape sequences, all following bytes are "printable", that is,
|
| + * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
|
| + * they are valid single/lead bytes.
|
| + * For simplicity, we always only report the initial ESC byte as the
|
| + * illegal sequence and back out all other bytes we looked at.
|
| + */
|
| + /* Back out some bytes. */
|
| + int8_t backOutDistance=_this->toULength-1;
|
| + int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
|
| + if(backOutDistance<=bytesFromThisBuffer) {
|
| + /* same as initialToULength<=1 */
|
| + *source-=backOutDistance;
|
| + } else {
|
| + /* Back out bytes from the previous buffer: Need to replay them. */
|
| + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
|
| + /* same as -(initialToULength-1) */
|
| + /* preToULength is negative! */
|
| + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
|
| + *source-=bytesFromThisBuffer;
|
| + }
|
| + _this->toULength=1;
|
| + }
|
| + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
|
| + _this->toUCallbackReason = UCNV_UNASSIGNED;
|
| + }
|
| +}
|
| +
|
| +/* Scans the input from *source for the next ESC_2022 byte.
|
| +* In the default build (U_ENABLE_GENERIC_ISO_2022 not defined) this returns a pointer
|
| +* to the first ESC_2022 byte, or sourceLimit if there is none.
|
| +* *source itself is only read, never modified.
|
| +*/
|
| +/* The callers use the result as the limit of the
|
| + * longest run of bytes that can be converted
|
| + * before an escape sequence has to be handled.
|
| + * The flush parameter is unnamed because only the disabled generic branch refers to it.
|
| + */
|
| +static inline const char*
|
| +getEndOfBuffer_2022(const char** source,
|
| + const char* sourceLimit,
|
| + UBool /*flush*/){
|
| +
|
| + const char* mySource = *source;
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + if (*source >= sourceLimit)
|
| + return sourceLimit;
|
| +
|
| + do{
|
| +
|
| + if (*mySource == ESC_2022){
|
| + int8_t i;
|
| + int32_t key = 0;
|
| + int32_t offset;
|
| + UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
|
| +
|
| + /* Kludge: I could not
|
| + * figure out the reason for validating an escape sequence
|
| + * twice - once here and once in changeState_2022().
|
| + * is it possible to have an ESC character in a ISO2022
|
| + * byte stream which is valid in a code page? Is it legal?
|
| + */
|
| + for (i=0;
|
| + (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
|
| + i++) {
|
| + value = getKey_2022(*(mySource+i), &key, &offset);
|
| + }
|
| + if (value > 0 || *mySource==ESC_2022)
|
| + return mySource;
|
| +
|
| + if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
|
| + return sourceLimit;
|
| + }
|
| + }while (++mySource < sourceLimit);
|
| +
|
| + return sourceLimit;
|
| +#else
|
| + while(mySource < sourceLimit && *mySource != ESC_2022) { /* stop at the first ESC or at the limit */
|
| + ++mySource;
|
| + }
|
| + return mySource;
|
| +#endif
|
| +}
|
| +
|
| +
|
| +/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
|
| + * any future change in _MBCSFromUChar32() function should be reflected here.
|
| + *
|
| + * Looks up the code point c in the fromUnicode tables of sharedData.
|
| + * @param sharedData  converter data whose mbcs tables are searched
|
| + * @param c           the code point to convert
|
| + * @param value       receives the result bytes packed into one integer
|
| + * @param useFallback passed to FROM_U_USE_FALLBACK() and to the extension lookup
|
| + * @param outputType  MBCS_OUTPUT_2 selects 2-byte table entries; any other value
|
| + *                    is handled as MBCS_OUTPUT_3 (3-byte entries)
|
| + * @return number of bytes in *value; negative number if fallback; 0 if no mapping
|
| + */
|
| +static inline int32_t
|
| +MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
|
| + UChar32 c,
|
| + uint32_t* value,
|
| + UBool useFallback,
|
| + int outputType)
|
| +{
|
| + const int32_t *cx;
|
| + const uint16_t *table;
|
| + uint32_t stage2Entry;
|
| + uint32_t myValue;
|
| + int32_t length;
|
| + const uint8_t *p;
|
| + /*
|
| + * TODO(markus): Use and require new, faster MBCS conversion table structures.
|
| + * Use internal version of ucnv_open() that verifies that the new structures are available,
|
| + * else U_INTERNAL_PROGRAM_ERROR.
|
| + */
|
| + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
| + if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
| + table=sharedData->mbcs.fromUnicodeTable;
|
| + stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
| + /* get the bytes and the length for the output */
|
| + if(outputType==MBCS_OUTPUT_2){
|
| + myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
|
| + if(myValue<=0xff) {
|
| + length=1;
|
| + } else {
|
| + length=2;
|
| + }
|
| + } else /* outputType==MBCS_OUTPUT_3 */ {
|
| + p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
|
| + myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; /* three bytes, first byte most significant */
|
| + if(myValue<=0xff) {
|
| + length=1;
|
| + } else if(myValue<=0xffff) {
|
| + length=2;
|
| + } else {
|
| + length=3;
|
| + }
|
| + }
|
| + /* is this code point assigned, or do we use fallbacks? */
|
| + if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
|
| + /* assigned */
|
| + *value=myValue;
|
| + return length;
|
| + } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
|
| + /*
|
| + * We allow a 0 byte output if the "assigned" bit is set for this entry.
|
| + * There is no way with this data structure for fallback output
|
| + * to be a zero byte.
|
| + */
|
| + *value=myValue;
|
| + return -length;
|
| + }
|
| + }
|
| +
|
| + cx=sharedData->mbcs.extIndexes; /* no result from the base table: try the extension table, if any */
|
| + if(cx!=NULL) {
|
| + return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
|
| + }
|
| +
|
| + /* unassigned */
|
| + return 0;
|
| +}
|
| +
|
| +/*
|
| + * Single-byte counterpart of MBCS_FROM_UCHAR32_ISO2022().
|
| + * This code mirrors _MBCSSingleFromUChar32() in ucnvmbcs.c; keep the two in sync.
|
| + *
|
| + * @param sharedData  converter data whose mbcs tables are searched
|
| + * @param c           the code point to convert
|
| + * @param retval      receives the low byte of the table result; not written when
|
| + *                    c is supplementary and the table has no supplementary data
|
| + * @param useFallback lowers the threshold for accepting a fallback result
|
| + * @return 1 roundtrip byte 0 no mapping -1 fallback byte
|
| + */
|
| +static inline int32_t
|
| +MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
|
| +                         UChar32 c,
|
| +                         uint32_t* retval,
|
| +                         UBool useFallback)
|
| +{
|
| +    const uint16_t *stageTable;
|
| +    int32_t result;
|
| +    int32_t fallbackThreshold;
|
| +
|
| +    /* BMP-only codepages carry no stage 1 entries for supplementary code points */
|
| +    if(c>=0x10000 && (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)==0) {
|
| +        return 0;
|
| +    }
|
| +
|
| +    /* fetch the table result for c (same lookup as in _MBCSFromUnicodeWithOffsets) */
|
| +    stageTable=sharedData->mbcs.fromUnicodeTable;
|
| +    result=MBCS_SINGLE_RESULT_FROM_U(stageTable, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
|
| +
|
| +    /* the output byte is stored even if the result turns out to be unusable */
|
| +    *retval=(uint32_t)(result&0xff);
|
| +
|
| +    if(result>=0xf00) {
|
| +        return 1; /* roundtrip */
|
| +    }
|
| +    fallbackThreshold = useFallback ? 0x800 : 0xc00;
|
| +    if(result>=fallbackThreshold) {
|
| +        return -1; /* fallback taken */
|
| +    }
|
| +    return 0; /* no mapping */
|
| +}
|
| +
|
| +/*
|
| + * Convert a strict EUC DBCS value (two bytes, each in A1..FE) to the
|
| + * ISO 2022 byte range 21..7E by subtracting 0x80 from each byte.
|
| + * Return 0 if the value is not such a byte pair.
|
| + */
|
| +static inline uint32_t
|
| +_2022FromGR94DBCS(uint32_t value) {
|
| +    /* the 16-bit value as a whole must lie in A1A1..FEFE */
|
| +    if((uint16_t)(value - 0xa1a1) > (0xfefe - 0xa1a1)) {
|
| +        return 0; /* not valid for ISO 2022 */
|
| +    }
|
| +    /* and the trail byte on its own in A1..FE */
|
| +    if((uint8_t)(value - 0xa1) > (0xfe - 0xa1)) {
|
| +        return 0; /* not valid for ISO 2022 */
|
| +    }
|
| +    return value - 0x8080; /* shift down to 21..7e byte range */
|
| +}
|
| +
|
| +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
|
| +/*
|
| + * Disabled code, kept for reference only (see the #if 0 above).
|
| + * This function does the reverse of _2022FromGR94DBCS(): it adds 0x8080 to the 2022 code point
|
| + * and returns the sum if both of its bytes are in the range A1..FE. Otherwise it returns the
|
| + * 2022 code point unchanged.
|
| + */
|
| +static inline uint32_t
|
| +_2022ToGR94DBCS(uint32_t value) {
|
| + uint32_t returnValue = value + 0x8080;
|
| + if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
| + (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
|
| + return returnValue;
|
| + } else {
|
| + return value;
|
| + }
|
| +}
|
| +#endif
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| +
|
| +/**********************************************************************************
|
| +* ISO-2022 Converter
|
| +*
|
| +* Generic toUnicode implementation; only compiled if U_ENABLE_GENERIC_ISO_2022 is defined.
|
| +* The loop alternates between two steps until the input is used up or an error occurs:
|
| +*  1. If no escape sequence is pending (key==0), the bytes up to the limit returned by
|
| +*     getEndOfBuffer_2022() are converted with ucnv_toUnicode() on the current
|
| +*     sub-converter (an "ASCII" converter is opened if there is none yet).
|
| +*  2. changeState_2022() processes the escape sequence that follows.
|
| +* Overflow, partial and invalid input of the sub-converter are copied to the
|
| +* ISO-2022 converter before returning.
|
| +*/
|
| +
|
| +static void
|
| +T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
|
| + UErrorCode* err){
|
| + const char* mySourceLimit, *realSourceLimit;
|
| + const char* sourceStart;
|
| + const UChar* myTargetStart;
|
| + UConverter* saveThis;
|
| + UConverterDataISO2022* myData;
|
| + int8_t length;
|
| +
|
| + saveThis = args->converter;
|
| + myData=((UConverterDataISO2022*)(saveThis->extraInfo));
|
| +
|
| + realSourceLimit = args->sourceLimit;
|
| + while (args->source < realSourceLimit) {
|
| + if(myData->key == 0) { /* are we in the middle of an escape sequence? */
|
| + /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
|
| + mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
|
| +
|
| + if(args->source < mySourceLimit) {
|
| + if(myData->currentConverter==NULL) {
|
| + myData->currentConverter = ucnv_open("ASCII",err);
|
| + if(U_FAILURE(*err)){
|
| + return;
|
| + }
|
| +
|
| + myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
|
| + saveThis->mode = UCNV_SO;
|
| + }
|
| +
|
| + /* convert to before the ESC or until the end of the buffer */
|
| + myData->isFirstBuffer=FALSE;
|
| + sourceStart = args->source;
|
| + myTargetStart = args->target;
|
| + args->converter = myData->currentConverter; /* temporarily substitute the sub-converter */
|
| + ucnv_toUnicode(args->converter,
|
| + &args->target,
|
| + args->targetLimit,
|
| + &args->source,
|
| + mySourceLimit,
|
| + args->offsets,
|
| + (UBool)(args->flush && mySourceLimit == realSourceLimit),
|
| + err);
|
| + args->converter = saveThis; /* restore the ISO-2022 converter */
|
| +
|
| + if (*err == U_BUFFER_OVERFLOW_ERROR) {
|
| + /* move the overflow buffer */
|
| + length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
|
| + myData->currentConverter->UCharErrorBufferLength = 0;
|
| + if(length > 0) {
|
| + uprv_memcpy(saveThis->UCharErrorBuffer,
|
| + myData->currentConverter->UCharErrorBuffer,
|
| + length*U_SIZEOF_UCHAR);
|
| + }
|
| + return;
|
| + }
|
| +
|
| + /*
|
| + * At least one of:
|
| + * -Error while converting
|
| + * -Done with entire buffer
|
| + * -Need to write offsets or update the current offset
|
| + * (leave that up to the code in ucnv.c)
|
| + *
|
| + * or else we just stopped at an ESC byte and continue with changeState_2022()
|
| + */
|
| + if (U_FAILURE(*err) ||
|
| + (args->source == realSourceLimit) ||
|
| + (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
|
| + (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
|
| + ) {
|
| + /* copy partial or error input for truncated detection and error handling */
|
| + if(U_FAILURE(*err)) {
|
| + length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
|
| + if(length > 0) {
|
| + uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
|
| + }
|
| + } else {
|
| + length = saveThis->toULength = myData->currentConverter->toULength;
|
| + if(length > 0) {
|
| + uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
|
| + if(args->source < mySourceLimit) {
|
| + *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
|
| + }
|
| + }
|
| + }
|
| + return;
|
| + }
|
| + }
|
| + }
|
| +
|
| + sourceStart = args->source;
|
| + changeState_2022(args->converter,
|
| + &(args->source),
|
| + realSourceLimit,
|
| + ISO_2022,
|
| + err);
|
| + if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
|
| + /* let the ucnv.c code update its current offset */
|
| + return;
|
| + }
|
| + }
|
| +}
|
| +
|
| +#endif
|
| +
|
| +/*
|
| + * To Unicode Callback helper function:
|
| + * records the input that failed to convert and sets the matching error code.
|
| + *
|
| + * cnv           - converter whose toUBytes[]/toULength receive the input byte(s)
|
| + * sourceChar    - the input that could not be converted; stored as two bytes
|
| + *                 (high byte first) if it is greater than 0xff, else as one byte
|
| + * targetUniChar - the conversion result; missingCharMarker-1 (0xfffe) selects
|
| + *                 U_INVALID_CHAR_FOUND, any other value U_ILLEGAL_CHAR_FOUND
|
| + * err           - receives the error code
|
| + */
|
| +static void
|
| +toUnicodeCallback(UConverter *cnv,
|
| +                  const uint32_t sourceChar, const uint32_t targetUniChar,
|
| +                  UErrorCode* err){
|
| +    if(sourceChar>0xff){
|
| +        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
|
| +        cnv->toUBytes[1] = (uint8_t)sourceChar;
|
| +        cnv->toULength = 2;
|
| +    }
|
| +    else{
|
| +        /* same unsigned byte cast as in the two-byte branch, not (char) */
|
| +        cnv->toUBytes[0] = (uint8_t)sourceChar;
|
| +        cnv->toULength = 1;
|
| +    }
|
| +
|
| +    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
|
| +        *err = U_INVALID_CHAR_FOUND;
|
| +    }
|
| +    else{
|
| +        *err = U_ILLEGAL_CHAR_FOUND;
|
| +    }
|
| +}
|
| +
|
| +/**************************************ISO-2022-JP*************************************************/
|
| +
|
| +/************************************** IMPORTANT **************************************************
|
| +* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
|
| +* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
|
| +* The converter iterates over each Unicode codepoint
|
| +* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
|
| +* processed one char at a time it would make sense to reduce the extra processing a canned converter
|
| +* would do as far as possible.
|
| +*
|
| +* If the implementation of these macros or structure of sharedData struct change in the future, make
|
| +* sure that ISO-2022 is also changed.
|
| +***************************************************************************************************
|
| +*/
|
| +
|
| +/***************************************************************************************************
|
| +* Rules for ISO-2022-jp encoding
|
| +* (i) Escape sequences must be fully contained within a line they should not
|
| +* span new lines or CRs
|
| +* (ii) If the last character on a line is represented by two bytes then an ASCII or
|
| +* JIS-Roman character escape sequence should follow before the line terminates
|
| +* (iii) If the first character on the line is represented by two bytes then a two
|
| +* byte character escape sequence should precede it
|
| +* (iv) If no escape sequence is encountered then the characters are ASCII
|
| +* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
|
| +* and invoked with SS2 (ESC N).
|
| +* (vi) If there is any G0 designation in text, there must be a switch to
|
| +* ASCII or to JIS X 0201-Roman before a space character (but not
|
| +* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
|
| +* characters such as tab or CRLF.
|
| +* (vii) Supported encodings:
|
| +* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
|
| +*
|
| +* source : RFC-1554
|
| +*
|
| +* JISX201, JISX208,JISX212 : new .cnv data files created
|
| +* KSC5601 : alias to ibm-949 mapping table
|
| +* GB2312 : alias to ibm-1386 mapping table
|
| +* ISO-8859-1 : Algorithmic implemented as LATIN1 case
|
| +* ISO-8859-7 : alias to ibm-9409 mapping table
|
| +*/
|
| +
|
| +/* preference order of JP charsets:
|
| + * the fromUnicode code walks this list front to back to append the charsets that are
|
| + * allowed for the converter version and not already chosen to its list of candidates */
|
| +static const StateEnum jpCharsetPref[]={
|
| + ASCII,
|
| + JISX201,
|
| + ISO8859_1,
|
| + ISO8859_7,
|
| + JISX208,
|
| + JISX212,
|
| + GB2312,
|
| + KSC5601,
|
| + HWKANA_7BIT
|
| +};
|
| +
|
| +/*
|
| + * Designation escape sequences (escSeqChars) and their lengths in bytes (escSeqCharsLen).
|
| + * Both tables are parallel: entry i of one belongs to entry i of the other.
|
| + *
|
| + * The escape sequences must be in order of the enum constants like JISX201 = 3,
|
| + * not in order of jpCharsetPref[]!
|
| + */
|
| +static const char escSeqChars[][6] ={
|
| + "\x1B\x28\x42", /* <ESC>(B ASCII */
|
| + "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
|
| + "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
|
| + "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
|
| + "\x1B\x24\x42", /* <ESC>$B JISX-208 */
|
| + "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
|
| + "\x1B\x24\x41", /* <ESC>$A GB2312 */
|
| + "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
|
| + "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
|
| +
|
| +};
|
| +static const int8_t escSeqCharsLen[] ={
|
| + 3, /* length of <ESC>(B ASCII */
|
| + 3, /* length of <ESC>.A ISO-8859-1 */
|
| + 3, /* length of <ESC>.F ISO-8859-7 */
|
| + 3, /* length of <ESC>(J JISX-201 */
|
| + 3, /* length of <ESC>$B JISX-208 */
|
| + 4, /* length of <ESC>$(D JISX-212 */
|
| + 3, /* length of <ESC>$A GB2312 */
|
| + 4, /* length of <ESC>$(C KSC5601 */
|
| + 3 /* length of <ESC>(I HWKANA_7BIT */
|
| +};
|
| +
|
| +/*
|
| +* The iteration over various code pages works this way:
|
| +* i) Get the currentState from myConverterData->currentState
|
| +* ii) Check if the character is mapped to a valid character in the currentState
|
| +* Yes -> a) set the initIterState to currentState
|
| +* b) remain in this state until an invalid character is found
|
| +* No -> a) go to the next code page and find the character
|
| +* iii) Before changing the state, increment the current state and check whether it
|
| +* is equal to the initIterState
|
| +* Yes -> A character that cannot be represented in any of the supported encodings
|
| +* break and return a U_INVALID_CHARACTER error
|
| +* No -> Continue and find the character in next code page
|
| +*
|
| +*
|
| +* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
|
| +*/
|
| +
|
| +/*
|
| + * Map 00..7F to Unicode according to JIS X 0201:
|
| + * only bytes 5C and 7E differ from the identity mapping.
|
| + */
|
| +static inline uint32_t
|
| +jisx201ToU(uint32_t value) {
|
| +    switch(value) {
|
| +    case 0x5c:
|
| +        return 0xa5;
|
| +    case 0x7e:
|
| +        return 0x203e;
|
| +    default:
|
| +        /* every other value maps to itself */
|
| +        return value;
|
| +    }
|
| +}
|
| +
|
| +/*
|
| + * Map Unicode to 00..7F according to JIS X 0201.
|
| + * U+00A5 and U+203E map to 5C and 7E; U+005C and U+007E themselves are unmappable.
|
| + * Return U+FFFE if unmappable.
|
| + */
|
| +static inline uint32_t
|
| +jisx201FromU(uint32_t value) {
|
| +    switch(value) {
|
| +    case 0xa5:
|
| +        return 0x5c;
|
| +    case 0x203e:
|
| +        return 0x7e;
|
| +    case 0x5c:
|
| +    case 0x7e:
|
| +        /* these two bytes are taken by the mappings above */
|
| +        return 0xfffe;
|
| +    default:
|
| +        return value<=0x7f ? value : 0xfffe;
|
| +    }
|
| +}
|
| +
|
| +/*
|
| + * Convert a valid Shift-JIS byte pair to the corresponding pair of
|
| + * JIS X 0208 bytes in 21..7E.
|
| + * Return 0 if the byte pair is beyond the JIS X 0208 range (value > EFFC).
|
| + */
|
| +static inline uint32_t
|
| +_2022FromSJIS(uint32_t value) {
|
| +    uint32_t result;
|
| +    uint8_t trail;
|
| +
|
| +    if(value > 0xEFFC) {
|
| +        return 0; /* beyond JIS X 0208 */
|
| +    }
|
| +
|
| +    trail = (uint8_t)value;
|
| +
|
| +    /* keep the lead byte only and remove the offset of its Shift-JIS block */
|
| +    result = value & 0xff00;
|
| +    result -= (result <= 0x9f00) ? 0x7000 : 0xb000; /* else 0xe000 <= lead <= 0xef00 */
|
| +    /* each Shift-JIS lead byte covers two rows */
|
| +    result <<= 1;
|
| +
|
| +    if(trail > 0x9e) {
|
| +        /* trail <= 0xfc: second row of the pair */
|
| +        result |= trail - 0x7e;
|
| +        return result;
|
| +    }
|
| +
|
| +    /* first row of the pair */
|
| +    result -= 0x100;
|
| +    if(trail <= 0x7e) {
|
| +        result |= trail - 0x1f;
|
| +    } else {
|
| +        result |= trail - 0x20;
|
| +    }
|
| +    return result;
|
| +}
|
| +
|
| +/*
|
| + * Convert a pair of JIS X 0208 bytes (c1, c2), each expected in 21..7E,
|
| + * to the Shift-JIS pair bytes[0], bytes[1].
|
| + * Out-of-range input is turned into bytes that are not valid for Shift-JIS
|
| + * so that the converter catches it. A byte is set to 0 where this needs an
|
| + * explicit test; other invalid values already yield equally invalid
|
| + * Shift-JIS bytes without one.
|
| + */
|
| +static inline void
|
| +_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
|
| +    uint8_t lead;
|
| +    uint8_t trail;
|
| +
|
| +    if((c1 & 1) != 0) {
|
| +        /* odd first byte */
|
| +        lead = (uint8_t)(c1 + 1);
|
| +        if(c2 > 0x7e) {
|
| +            trail = 0; /* invalid */
|
| +        } else {
|
| +            trail = (uint8_t)(c2 + (c2 <= 0x5f ? 0x1f : 0x20));
|
| +        }
|
| +    } else {
|
| +        /* even first byte */
|
| +        lead = c1;
|
| +        if((uint8_t)(c2 - 0x21) > (0x7e - 0x21)) {
|
| +            trail = 0; /* invalid */
|
| +        } else {
|
| +            trail = (uint8_t)(c2 + 0x7e);
|
| +        }
|
| +    }
|
| +
|
| +    /* two rows share one Shift-JIS lead byte */
|
| +    lead = (uint8_t)(lead >> 1);
|
| +    if(lead > 0x3f) {
|
| +        lead = 0; /* invalid */
|
| +    } else if(lead > 0x2f) {
|
| +        lead = (uint8_t)(lead + 0xb0);
|
| +    } else {
|
| +        lead = (uint8_t)(lead + 0x70);
|
| +    }
|
| +
|
| +    bytes[0] = (char)lead;
|
| +    bytes[1] = (char)trail;
|
| +}
|
| +
|
| +/*
|
| + * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
|
| + * Katakana.
|
| + * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
|
| + * because Shift-JIS roundtrips half-width Katakana to single bytes.
|
| + * These were the only fallbacks in ICU's jisx-208.ucm file.
|
| + *
|
| + * The table is indexed with (code point - HWKANA_START); each entry is the
|
| + * two-byte JIS X 0208 value used as the fallback for that code point.
|
| + * The U+FFxx comments below mark the code point of every 16th entry.
|
| + */
|
| +static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
|
| + 0x2123, /* U+FF61 */
|
| + 0x2156,
|
| + 0x2157,
|
| + 0x2122,
|
| + 0x2126,
|
| + 0x2572,
|
| + 0x2521,
|
| + 0x2523,
|
| + 0x2525,
|
| + 0x2527,
|
| + 0x2529,
|
| + 0x2563,
|
| + 0x2565,
|
| + 0x2567,
|
| + 0x2543,
|
| + 0x213C, /* U+FF70 */
|
| + 0x2522,
|
| + 0x2524,
|
| + 0x2526,
|
| + 0x2528,
|
| + 0x252A,
|
| + 0x252B,
|
| + 0x252D,
|
| + 0x252F,
|
| + 0x2531,
|
| + 0x2533,
|
| + 0x2535,
|
| + 0x2537,
|
| + 0x2539,
|
| + 0x253B,
|
| + 0x253D,
|
| + 0x253F, /* U+FF80 */
|
| + 0x2541,
|
| + 0x2544,
|
| + 0x2546,
|
| + 0x2548,
|
| + 0x254A,
|
| + 0x254B,
|
| + 0x254C,
|
| + 0x254D,
|
| + 0x254E,
|
| + 0x254F,
|
| + 0x2552,
|
| + 0x2555,
|
| + 0x2558,
|
| + 0x255B,
|
| + 0x255E,
|
| + 0x255F, /* U+FF90 */
|
| + 0x2560,
|
| + 0x2561,
|
| + 0x2562,
|
| + 0x2564,
|
| + 0x2566,
|
| + 0x2568,
|
| + 0x2569,
|
| + 0x256A,
|
| + 0x256B,
|
| + 0x256C,
|
| + 0x256D,
|
| + 0x256F,
|
| + 0x2573,
|
| + 0x212B,
|
| + 0x212C /* U+FF9F */
|
| +};
|
| +
|
| +static void
|
| +UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
|
| + UConverter *cnv = args->converter;
|
| + UConverterDataISO2022 *converterData;
|
| + ISO2022State *pFromU2022State;
|
| + uint8_t *target = (uint8_t *) args->target;
|
| + const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
|
| + const UChar* source = args->source;
|
| + const UChar* sourceLimit = args->sourceLimit;
|
| + int32_t* offsets = args->offsets;
|
| + UChar32 sourceChar;
|
| + char buffer[8];
|
| + int32_t len, outLen;
|
| + int8_t choices[10];
|
| + int32_t choiceCount;
|
| + uint32_t targetValue = 0;
|
| + UBool useFallback;
|
| +
|
| + int32_t i;
|
| + int8_t cs, g;
|
| +
|
| + /* set up the state */
|
| + converterData = (UConverterDataISO2022*)cnv->extraInfo;
|
| + pFromU2022State = &converterData->fromU2022State;
|
| +
|
| + choiceCount = 0;
|
| +
|
| + /* check if the last codepoint of previous buffer was a lead surrogate*/
|
| + if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
|
| + goto getTrail;
|
| + }
|
| +
|
| + while(source < sourceLimit) {
|
| + if(target < targetLimit) {
|
| +
|
| + sourceChar = *(source++);
|
| + /*check if the char is a First surrogate*/
|
| + if(U16_IS_SURROGATE(sourceChar)) {
|
| + if(U16_IS_SURROGATE_LEAD(sourceChar)) {
|
| +getTrail:
|
| + /*look ahead to find the trail surrogate*/
|
| + if(source < sourceLimit) {
|
| + /* test the following code unit */
|
| + UChar trail=(UChar) *source;
|
| + if(U16_IS_TRAIL(trail)) {
|
| + source++;
|
| + sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
|
| + cnv->fromUChar32=0x00;
|
| + /* convert this supplementary code point */
|
| + /* exit this condition tree */
|
| + } else {
|
| + /* this is an unmatched lead code unit (1st surrogate) */
|
| + /* callback(illegal) */
|
| + *err=U_ILLEGAL_CHAR_FOUND;
|
| + cnv->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| + } else {
|
| + /* no more input */
|
| + cnv->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| + } else {
|
| + /* this is an unmatched trail code unit (2nd surrogate) */
|
| + /* callback(illegal) */
|
| + *err=U_ILLEGAL_CHAR_FOUND;
|
| + cnv->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| + }
|
| +
|
| + /* do not convert SO/SI/ESC */
|
| + if(IS_2022_CONTROL(sourceChar)) {
|
| + /* callback(illegal) */
|
| + *err=U_ILLEGAL_CHAR_FOUND;
|
| + cnv->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| +
|
| + /* do the conversion */
|
| +
|
| + if(choiceCount == 0) {
|
| + uint16_t csm;
|
| +
|
| + /*
|
| + * The csm variable keeps track of which charsets are allowed
|
| + * and not used yet while building the choices[].
|
| + */
|
| + csm = jpCharsetMasks[converterData->version];
|
| + choiceCount = 0;
|
| +
|
| + /* JIS7/8: try single-byte half-width Katakana before JISX208 */
|
| + if(converterData->version == 3 || converterData->version == 4) {
|
| + choices[choiceCount++] = (int8_t)HWKANA_7BIT;
|
| + }
|
| + /* Do not try single-byte half-width Katakana for other versions. */
|
| + csm &= ~CSM(HWKANA_7BIT);
|
| +
|
| + /* try the current G0 charset */
|
| + choices[choiceCount++] = cs = pFromU2022State->cs[0];
|
| + csm &= ~CSM(cs);
|
| +
|
| + /* try the current G2 charset */
|
| + if((cs = pFromU2022State->cs[2]) != 0) {
|
| + choices[choiceCount++] = cs;
|
| + csm &= ~CSM(cs);
|
| + }
|
| +
|
| + /* try all the other possible charsets */
|
| + for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
|
| + cs = (int8_t)jpCharsetPref[i];
|
| + if(CSM(cs) & csm) {
|
| + choices[choiceCount++] = cs;
|
| + csm &= ~CSM(cs);
|
| + }
|
| + }
|
| + }
|
| +
|
| + cs = g = 0;
|
| + /*
|
| + * len==0: no mapping found yet
|
| + * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
|
| + * len>0: found a roundtrip result, done
|
| + */
|
| + len = 0;
|
| + /*
|
| + * We will turn off useFallback after finding a fallback,
|
| + * but we still get fallbacks from PUA code points as usual.
|
| + * Therefore, we will also need to check that we don't overwrite
|
| + * an early fallback with a later one.
|
| + */
|
| + useFallback = cnv->useFallback;
|
| +
|
| + for(i = 0; i < choiceCount && len <= 0; ++i) {
|
| + uint32_t value;
|
| + int32_t len2;
|
| + int8_t cs0 = choices[i];
|
| + switch(cs0) {
|
| + case ASCII:
|
| + if(sourceChar <= 0x7f) {
|
| + targetValue = (uint32_t)sourceChar;
|
| + len = 1;
|
| + cs = cs0;
|
| + g = 0;
|
| + }
|
| + break;
|
| + case ISO8859_1:
|
| + if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
|
| + targetValue = (uint32_t)sourceChar - 0x80;
|
| + len = 1;
|
| + cs = cs0;
|
| + g = 2;
|
| + }
|
| + break;
|
| + case HWKANA_7BIT:
|
| + if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
|
| + if(converterData->version==3) {
|
| + /* JIS7: use G1 (SO) */
|
| + /* Shift U+FF61..U+FF9F to bytes 21..5F. */
|
| + targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
|
| + len = 1;
|
| + pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
|
| + g = 1;
|
| + } else if(converterData->version==4) {
|
| + /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
|
| + /* Shift U+FF61..U+FF9F to bytes A1..DF. */
|
| + targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
|
| + len = 1;
|
| +
|
| + cs = pFromU2022State->cs[0];
|
| + if(IS_JP_DBCS(cs)) {
|
| + /* switch from a DBCS charset to JISX201 */
|
| + cs = (int8_t)JISX201;
|
| + }
|
| + /* else stay in the current G0 charset */
|
| + g = 0;
|
| + }
|
| + /* else do not use HWKANA_7BIT with other versions */
|
| + }
|
| + break;
|
| + case JISX201:
|
| + /* G0 SBCS */
|
| + value = jisx201FromU(sourceChar);
|
| + if(value <= 0x7f) {
|
| + targetValue = value;
|
| + len = 1;
|
| + cs = cs0;
|
| + g = 0;
|
| + useFallback = FALSE;
|
| + }
|
| + break;
|
| + case JISX208:
|
| + /* G0 DBCS from Shift-JIS table */
|
| + len2 = MBCS_FROM_UCHAR32_ISO2022(
|
| + converterData->myConverterArray[cs0],
|
| + sourceChar, &value,
|
| + useFallback, MBCS_OUTPUT_2);
|
| + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
|
| + value = _2022FromSJIS(value);
|
| + if(value != 0) {
|
| + targetValue = value;
|
| + len = len2;
|
| + cs = cs0;
|
| + g = 0;
|
| + useFallback = FALSE;
|
| + }
|
| + } else if(len == 0 && useFallback &&
|
| + (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
|
| + targetValue = hwkana_fb[sourceChar - HWKANA_START];
|
| + len = -2;
|
| + cs = cs0;
|
| + g = 0;
|
| + useFallback = FALSE;
|
| + }
|
| + break;
|
| + case ISO8859_7:
|
| + /* G0 SBCS forced to 7-bit output */
|
| + len2 = MBCS_SINGLE_FROM_UCHAR32(
|
| + converterData->myConverterArray[cs0],
|
| + sourceChar, &value,
|
| + useFallback);
|
| + if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
|
| + targetValue = value - 0x80;
|
| + len = len2;
|
| + cs = cs0;
|
| + g = 2;
|
| + useFallback = FALSE;
|
| + }
|
| + break;
|
| + default:
|
| + /* G0 DBCS */
|
| + len2 = MBCS_FROM_UCHAR32_ISO2022(
|
| + converterData->myConverterArray[cs0],
|
| + sourceChar, &value,
|
| + useFallback, MBCS_OUTPUT_2);
|
| + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
|
| + if(cs0 == KSC5601) {
|
| + /*
|
| + * Check for valid bytes for the encoding scheme.
|
| + * This is necessary because the sub-converter (windows-949)
|
| + * has a broader encoding scheme than is valid for 2022.
|
| + */
|
| + value = _2022FromGR94DBCS(value);
|
| + if(value == 0) {
|
| + break;
|
| + }
|
| + }
|
| + targetValue = value;
|
| + len = len2;
|
| + cs = cs0;
|
| + g = 0;
|
| + useFallback = FALSE;
|
| + }
|
| + break;
|
| + }
|
| + }
|
| +
|
| + if(len != 0) {
|
| + if(len < 0) {
|
| + len = -len; /* fallback */
|
| + }
|
| + outLen = 0; /* count output bytes */
|
| +
|
| + /* write SI if necessary (only for JIS7) */
|
| + if(pFromU2022State->g == 1 && g == 0) {
|
| + buffer[outLen++] = UCNV_SI;
|
| + pFromU2022State->g = 0;
|
| + }
|
| +
|
| + /* write the designation sequence if necessary */
|
| + if(cs != pFromU2022State->cs[g]) {
|
| + int32_t escLen = escSeqCharsLen[cs];
|
| + uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
|
| + outLen += escLen;
|
| + pFromU2022State->cs[g] = cs;
|
| +
|
| + /* invalidate the choices[] */
|
| + choiceCount = 0;
|
| + }
|
| +
|
| + /* write the shift sequence if necessary */
|
| + if(g != pFromU2022State->g) {
|
| + switch(g) {
|
| + /* case 0 handled before writing escapes */
|
| + case 1:
|
| + buffer[outLen++] = UCNV_SO;
|
| + pFromU2022State->g = 1;
|
| + break;
|
| + default: /* case 2 */
|
| + buffer[outLen++] = 0x1b;
|
| + buffer[outLen++] = 0x4e;
|
| + break;
|
| + /* no case 3: no SS3 in ISO-2022-JP-x */
|
| + }
|
| + }
|
| +
|
| + /* write the output bytes */
|
| + if(len == 1) {
|
| + buffer[outLen++] = (char)targetValue;
|
| + } else /* len == 2 */ {
|
| + buffer[outLen++] = (char)(targetValue >> 8);
|
| + buffer[outLen++] = (char)targetValue;
|
| + }
|
| + } else {
|
| + /*
|
| + * if we cannot find the character after checking all codepages
|
| + * then this is an error
|
| + */
|
| + *err = U_INVALID_CHAR_FOUND;
|
| + cnv->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| +
|
| + if(sourceChar == CR || sourceChar == LF) {
|
| + /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
|
| + pFromU2022State->cs[2] = 0;
|
| + choiceCount = 0;
|
| + }
|
| +
|
| + /* output outLen>0 bytes in buffer[] */
|
| + if(outLen == 1) {
|
| + *target++ = buffer[0];
|
| + if(offsets) {
|
| + *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
|
| + }
|
| + } else if(outLen == 2 && (target + 2) <= targetLimit) {
|
| + *target++ = buffer[0];
|
| + *target++ = buffer[1];
|
| + if(offsets) {
|
| + int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
|
| + *offsets++ = sourceIndex;
|
| + *offsets++ = sourceIndex;
|
| + }
|
| + } else {
|
| + fromUWriteUInt8(
|
| + cnv,
|
| + buffer, outLen,
|
| + &target, (const char *)targetLimit,
|
| + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
|
| + err);
|
| + if(U_FAILURE(*err)) {
|
| + break;
|
| + }
|
| + }
|
| + } /* end if(myTargetIndex<myTargetLength) */
|
| + else{
|
| + *err =U_BUFFER_OVERFLOW_ERROR;
|
| + break;
|
| + }
|
| +
|
| + }/* end while(mySourceIndex<mySourceLength) */
|
| +
|
| + /*
|
| + * the end of the input stream and detection of truncated input
|
| + * are handled by the framework, but for ISO-2022-JP conversion
|
| + * we need to be in ASCII mode at the very end
|
| + *
|
| + * conditions:
|
| + * successful
|
| + * in SO mode or not in ASCII mode
|
| + * end of input and no truncated input
|
| + */
|
| + if( U_SUCCESS(*err) &&
|
| + (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
|
| + args->flush && source>=sourceLimit && cnv->fromUChar32==0
|
| + ) {
|
| + int32_t sourceIndex;
|
| +
|
| + outLen = 0;
|
| +
|
| + if(pFromU2022State->g != 0) {
|
| + buffer[outLen++] = UCNV_SI;
|
| + pFromU2022State->g = 0;
|
| + }
|
| +
|
| + if(pFromU2022State->cs[0] != ASCII) {
|
| + int32_t escLen = escSeqCharsLen[ASCII];
|
| + uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
|
| + outLen += escLen;
|
| + pFromU2022State->cs[0] = (int8_t)ASCII;
|
| + }
|
| +
|
| + /* get the source index of the last input character */
|
| + /*
|
| + * TODO this would be simpler and more reliable if we used a pair
|
| + * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
|
| + * so that we could simply use the prevSourceIndex here;
|
| + * this code gives an incorrect result for the rare case of an unmatched
|
| + * trail surrogate that is alone in the last buffer of the text stream
|
| + */
|
| + sourceIndex=(int32_t)(source-args->source);
|
| + if(sourceIndex>0) {
|
| + --sourceIndex;
|
| + if( U16_IS_TRAIL(args->source[sourceIndex]) &&
|
| + (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
|
| + ) {
|
| + --sourceIndex;
|
| + }
|
| + } else {
|
| + sourceIndex=-1;
|
| + }
|
| +
|
| + fromUWriteUInt8(
|
| + cnv,
|
| + buffer, outLen,
|
| + &target, (const char *)targetLimit,
|
| + &offsets, sourceIndex,
|
| + err);
|
| + }
|
| +
|
| + /*save the state and return */
|
| + args->source = source;
|
| + args->target = (char*)target;
|
| +}
|
| +
|
| +/*************** to unicode *******************/
|
| +/*
|
| + * Converts bytes of the ISO-2022-JP family from args->source into UChars at args->target.
|
| + * The designated charsets (cs[]) and the active one (g) are kept in myData->toU2022State;
|
| + * escape sequences are processed by changeState_2022(). A DBCS lead byte that ends the
|
| + * input is saved in converter->toUBytes[] and picked up again on the next call.
|
| + * args->source and args->target are updated before returning; errors are set in *err.
|
| + */
|
| +static void
|
| +UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
| + UErrorCode* err){
|
| + char tempBuf[2]; /* the two bytes handed to the DBCS sub-converter lookup */
|
| + const char *mySource = (char *) args->source;
|
| + UChar *myTarget = args->target;
|
| + const char *mySourceLimit = args->sourceLimit;
|
| + uint32_t targetUniChar = 0x0000;
|
| + uint32_t mySourceChar = 0x0000;
|
| + uint32_t tmpSourceChar = 0x0000;
|
| + UConverterDataISO2022* myData;
|
| + ISO2022State *pToU2022State;
|
| + StateEnum cs;
|
| +
|
| + myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
| + pToU2022State = &myData->toU2022State;
|
| +
|
| + if(myData->key != 0) {
|
| + /* continue with a partial escape sequence */
|
| + goto escape;
|
| + } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
|
| + /* continue with a partial double-byte character: the lead byte was saved in toUBytes[0] */
|
| + mySourceChar = args->converter->toUBytes[0];
|
| + args->converter->toULength = 0;
|
| + cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
| + targetUniChar = missingCharMarker;
|
| + goto getTrailByte;
|
| + }
|
| +
|
| + while(mySource < mySourceLimit){
|
| +
|
| + targetUniChar =missingCharMarker;
|
| +
|
| + if(myTarget < args->targetLimit){
|
| +
|
| + mySourceChar= (unsigned char) *mySource++;
|
| +
|
| + switch(mySourceChar) {
|
| + case UCNV_SI:
|
| + if(myData->version==3) {
|
| + pToU2022State->g=0;
|
| + continue;
|
| + } else {
|
| + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
| + myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
| + break;
|
| + }
|
| +
|
| + case UCNV_SO:
|
| + if(myData->version==3) {
|
| + /* JIS7: switch to G1 half-width Katakana */
|
| + pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
|
| + pToU2022State->g=1;
|
| + continue;
|
| + } else {
|
| + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
| + myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
| + break;
|
| + }
|
| +
|
| + case ESC_2022:
|
| + mySource--; /* back up so that changeState_2022() sees the ESC byte itself */
|
| +escape:
|
| + {
|
| + const char * mySourceBefore = mySource;
|
| + int8_t toULengthBefore = args->converter->toULength;
|
| +
|
| + changeState_2022(args->converter,&(mySource),
|
| + mySourceLimit, ISO_2022_JP,err);
|
| +
|
| + /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
|
| + if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); /* previous length plus the bytes consumed by this call */
|
| + }
|
| + }
|
| +
|
| + /* invalid or illegal escape sequence */
|
| + if(U_FAILURE(*err)){
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
| + return;
|
| + }
|
| + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
|
| + if(myData->key==0) {
|
| + myData->isEmptySegment = TRUE;
|
| + }
|
| + continue;
|
| +
|
| + /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
|
| +
|
| + case CR:
|
| + /*falls through*/
|
| + case LF:
|
| + /* automatically reset to single-byte mode */
|
| + if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
|
| + pToU2022State->cs[0] = (int8_t)ASCII;
|
| + }
|
| + pToU2022State->cs[2] = 0;
|
| + pToU2022State->g = 0;
|
| + /* falls through */
|
| + default:
|
| + /* convert one or two bytes */
|
| + myData->isEmptySegment = FALSE;
|
| + cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
| + if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
|
| + !IS_JP_DBCS(cs)
|
| + ) {
|
| + /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
|
| + targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
|
| +
|
| + /* return from a single-shift state to the previous one */
|
| + if(pToU2022State->g >= 2) {
|
| + pToU2022State->g=pToU2022State->prevG;
|
| + }
|
| + } else switch(cs) {
|
| + case ASCII:
|
| + if(mySourceChar <= 0x7f) {
|
| + targetUniChar = mySourceChar;
|
| + }
|
| + break;
|
| + case ISO8859_1:
|
| + if(mySourceChar <= 0x7f) {
|
| + targetUniChar = mySourceChar + 0x80;
|
| + }
|
| + /* return from a single-shift state to the previous one */
|
| + pToU2022State->g=pToU2022State->prevG;
|
| + break;
|
| + case ISO8859_7:
|
| + if(mySourceChar <= 0x7f) {
|
| + /* convert mySourceChar+0x80 to use a normal 8-bit table */
|
| + targetUniChar =
|
| + _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
|
| + myData->myConverterArray[cs],
|
| + mySourceChar + 0x80);
|
| + }
|
| + /* return from a single-shift state to the previous one */
|
| + pToU2022State->g=pToU2022State->prevG;
|
| + break;
|
| + case JISX201:
|
| + if(mySourceChar <= 0x7f) {
|
| + targetUniChar = jisx201ToU(mySourceChar);
|
| + }
|
| + break;
|
| + case HWKANA_7BIT:
|
| + if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
|
| + /* 7-bit halfwidth Katakana */
|
| + targetUniChar = mySourceChar + (HWKANA_START - 0x21);
|
| + }
|
| + break;
|
| + default:
|
| + /* G0 DBCS */
|
| + if(mySource < mySourceLimit) {
|
| + int leadIsOk, trailIsOk;
|
| + uint8_t trailByte;
|
| +getTrailByte:
|
| + trailByte = (uint8_t)*mySource;
|
| + /*
|
| + * Ticket 5691: consistent illegal sequences:
|
| + * - We include at least the first byte in the illegal sequence.
|
| + * - If any of the non-initial bytes could be the start of a character,
|
| + * we stop the illegal sequence before the first one of those.
|
| + *
|
| + * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
|
| + * an ESC/SO/SI, we report only the first byte as the illegal sequence.
|
| + * Otherwise we convert or report the pair of bytes.
|
| + */
|
| + leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
| + trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
|
| + if (leadIsOk && trailIsOk) {
|
| + ++mySource;
|
| + tmpSourceChar = (mySourceChar << 8) | trailByte;
|
| + if(cs == JISX208) {
|
| + _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
|
| + mySourceChar = tmpSourceChar;
|
| + } else {
|
| + /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
|
| + mySourceChar = tmpSourceChar;
|
| + if (cs == KSC5601) {
|
| + tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
|
| + }
|
| + tempBuf[0] = (char)(tmpSourceChar >> 8);
|
| + tempBuf[1] = (char)(tmpSourceChar);
|
| + }
|
| + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
|
| + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
|
| + /* report a pair of illegal bytes if the second byte is not a DBCS starter */
|
| + ++mySource;
|
| + /* add another bit so that the code below writes 2 bytes in case of error */
|
| + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
|
| + }
|
| + } else {
|
| + /* the lead byte is the last input byte: save it and resume at getTrailByte on the next call */
|
| + args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| + args->converter->toULength = 1;
|
| + goto endloop;
|
| + }
|
| + } /* End of inner switch */
|
| + break;
|
| + } /* End of outer switch */
|
| + if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ /* result is written as a single UChar */
|
| + if(args->offsets){
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); /* back up by the 1 or 2 bytes just consumed */
|
| + }
|
| + *(myTarget++)=(UChar)targetUniChar;
|
| + }
|
| + else if(targetUniChar > missingCharMarker){
|
| + /* disassemble the surrogate pair and write to output*/
|
| + targetUniChar-=0x0010000;
|
| + *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
|
| + if(args->offsets){
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
|
| + }
|
| + ++myTarget;
|
| + if(myTarget< args->targetLimit){
|
| + *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
|
| + if(args->offsets){
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
|
| + }
|
| + ++myTarget;
|
| + }else{
|
| + /* no room for the trail surrogate: store it in the converter's UChar overflow buffer */
|
| + args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
|
| + (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
|
| + }
|
| +
|
| + }
|
| + else{
|
| + /* Call the callback function*/
|
| + toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
|
| + break;
|
| + }
|
| + }
|
| + else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
|
| + *err =U_BUFFER_OVERFLOW_ERROR;
|
| + break;
|
| + }
|
| + }
|
| +endloop:
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| +}
|
| +
|
| +
|
| +/***************************************************************
|
| +* Rules for ISO-2022-KR encoding
|
| +* i) The KSC5601 designator sequence should appear only once in a file,
|
| +* at the beginning of a line before any KSC5601 characters. This usually
|
| +* means that it appears by itself on the first line of the file
|
| +* ii) There are only 2 shifting sequences SO to shift into double byte mode
|
| +* and SI to shift into single byte mode
|
| +*/
|
| +static void
|
| +UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
| + /* Runs the MBCS fromUnicode implementation on the sub-converter (currentConverter) and copies the resulting state back to the public converter. */
|
| + UConverter* saveConv = args->converter; /* the public converter, restored before returning */
|
| + UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
|
| + args->converter=myConverterData->currentConverter; /* temporarily substitute the sub-converter */
|
| +
|
| + myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; /* carry fromUChar32 into the sub-converter ... */
|
| + ucnv_MBCSFromUnicodeWithOffsets(args,err);
|
| + saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; /* ... and back out after the call */
|
| +
|
| + if(*err == U_BUFFER_OVERFLOW_ERROR) { /* move the sub-converter's overflow bytes to the public converter */
|
| + if(myConverterData->currentConverter->charErrorBufferLength > 0) {
|
| + uprv_memcpy(
|
| + saveConv->charErrorBuffer,
|
| + myConverterData->currentConverter->charErrorBuffer,
|
| + myConverterData->currentConverter->charErrorBufferLength);
|
| + }
|
| + saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
|
| + myConverterData->currentConverter->charErrorBufferLength = 0;
|
| + }
|
| + args->converter=saveConv;
|
| +}
|
| +/*
|
| + * Converts UChars from args->source into ISO-2022-KR bytes at args->target.
|
| + * Version 1 is handed to the _IBM variant. Otherwise each code point is looked up in the
|
| + * sub-converter's shared data: single bytes are written unchanged, double bytes are
|
| + * written with 0x80 subtracted from each byte, and SO/SI is emitted whenever the output
|
| + * switches between the two. The current single/double-byte mode is kept in
|
| + * converter->fromUnicodeStatus; args->source and args->target are updated on return.
|
| + */
|
| +static void
|
| +UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
| +
|
| + const UChar *source = args->source;
|
| + const UChar *sourceLimit = args->sourceLimit;
|
| + unsigned char *target = (unsigned char *) args->target;
|
| + unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
| + int32_t* offsets = args->offsets;
|
| + uint32_t targetByteUnit = 0x0000;
|
| + UChar32 sourceChar = 0x0000;
|
| + UBool isTargetByteDBCS;
|
| + UBool oldIsTargetByteDBCS;
|
| + UConverterDataISO2022 *converterData;
|
| + UConverterSharedData* sharedData;
|
| + UBool useFallback;
|
| + int32_t length =0;
|
| +
|
| + converterData=(UConverterDataISO2022*)args->converter->extraInfo;
|
| + /* if the version is 1 then the user is requesting
|
| + * conversion with ibm-25546 pass the arguments to
|
| + * MBCS converter and return
|
| + */
|
| + if(converterData->version==1){
|
| + UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
|
| + return;
|
| + }
|
| +
|
| + /* initialize data */
|
| + sharedData = converterData->currentConverter->sharedData;
|
| + useFallback = args->converter->useFallback;
|
| + isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
|
| + oldIsTargetByteDBCS = isTargetByteDBCS;
|
| +
|
| + isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; /* NOTE(review): repeats the assignment made two statements above */
|
| + if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
|
| + goto getTrail; /* a code unit is pending from the previous call */
|
| + }
|
| + while(source < sourceLimit){
|
| +
|
| + targetByteUnit = missingCharMarker;
|
| +
|
| + if(target < (unsigned char*) args->targetLimit){
|
| + sourceChar = *source++;
|
| +
|
| + /* do not convert SO/SI/ESC */
|
| + if(IS_2022_CONTROL(sourceChar)) {
|
| + /* callback(illegal) */
|
| + *err=U_ILLEGAL_CHAR_FOUND;
|
| + args->converter->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| +
|
| + length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
|
| + if(length < 0) {
|
| + length = -length; /* fallback */
|
| + }
|
| + /* only DBCS or SBCS characters are expected*/
|
| + /* DB characters with high bit set to 1 are expected */
|
| + if( length > 2 || length==0 ||
|
| + (length == 1 && targetByteUnit > 0x7f) ||
|
| + (length == 2 &&
|
| + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
|
| + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
|
| + ) {
|
| + targetByteUnit=missingCharMarker;
|
| + }
|
| + if (targetByteUnit != missingCharMarker){
|
| +
|
| + oldIsTargetByteDBCS = isTargetByteDBCS;
|
| + isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
|
| + /* append the shift sequence */
|
| + if (oldIsTargetByteDBCS != isTargetByteDBCS ){
|
| +
|
| + if (isTargetByteDBCS)
|
| + *target++ = UCNV_SO;
|
| + else
|
| + *target++ = UCNV_SI;
|
| + if(offsets)
|
| + *(offsets++) = (int32_t)(source - args->source-1);
|
| + }
|
| + /* write the targetByteUnit to target */
|
| + if(targetByteUnit <= 0x00FF){
|
| + if( target < targetLimit){
|
| + *(target++) = (unsigned char) targetByteUnit;
|
| + if(offsets){
|
| + *(offsets++) = (int32_t)(source - args->source-1);
|
| + }
|
| +
|
| + }else{
|
| + /* target is full: keep the byte in the converter's overflow buffer */
|
| + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
|
| + *err = U_BUFFER_OVERFLOW_ERROR;
|
| + }
|
| + }else{
|
| + if(target < targetLimit){
|
| + *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); /* lead byte with 0x80 subtracted */
|
| + if(offsets){
|
| + *(offsets++) = (int32_t)(source - args->source-1);
|
| + }
|
| + if(target < targetLimit){
|
| + *(target++) =(unsigned char) (targetByteUnit -0x80); /* trail byte with 0x80 subtracted */
|
| + if(offsets){
|
| + *(offsets++) = (int32_t)(source - args->source-1);
|
| + }
|
| + }else{
|
| + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
|
| + *err = U_BUFFER_OVERFLOW_ERROR;
|
| + }
|
| + }else{
|
| + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
|
| + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
|
| + *err = U_BUFFER_OVERFLOW_ERROR;
|
| + }
|
| + }
|
| +
|
| + }
|
| + else{
|
| + /* oops.. the code point is unassigned
|
| + * set the error and reason
|
| + */
|
| +
|
| + /*check if the char is a First surrogate*/
|
| + if(U16_IS_SURROGATE(sourceChar)) {
|
| + if(U16_IS_SURROGATE_LEAD(sourceChar)) {
|
| +getTrail:
|
| + /*look ahead to find the trail surrogate*/
|
| + if(source < sourceLimit) {
|
| + /* test the following code unit */
|
| + UChar trail=(UChar) *source;
|
| + if(U16_IS_TRAIL(trail)) {
|
| + source++;
|
| + sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
|
| + *err = U_INVALID_CHAR_FOUND;
|
| + /* convert this surrogate code point */
|
| + /* exit this condition tree */
|
| + } else {
|
| + /* this is an unmatched lead code unit (1st surrogate) */
|
| + /* callback(illegal) */
|
| + *err=U_ILLEGAL_CHAR_FOUND;
|
| + }
|
| + } else {
|
| + /* no more input */
|
| + *err = U_ZERO_ERROR;
|
| + }
|
| + } else {
|
| + /* this is an unmatched trail code unit (2nd surrogate) */
|
| + /* callback(illegal) */
|
| + *err=U_ILLEGAL_CHAR_FOUND;
|
| + }
|
| + } else {
|
| + /* callback(unassigned) for a BMP code point */
|
| + *err = U_INVALID_CHAR_FOUND;
|
| + }
|
| +
|
| + args->converter->fromUChar32=sourceChar;
|
| + break;
|
| + }
|
| + } /* end if(target < targetLimit) */
|
| + else{
|
| + *err =U_BUFFER_OVERFLOW_ERROR;
|
| + break;
|
| + }
|
| +
|
| + }/* end while(source < sourceLimit) */
|
| +
|
| + /*
|
| + * the end of the input stream and detection of truncated input
|
| + * are handled by the framework, but for ISO-2022-KR conversion
|
| + * we need to be in ASCII mode at the very end
|
| + *
|
| + * conditions:
|
| + * successful
|
| + * not in ASCII mode
|
| + * end of input and no truncated input
|
| + */
|
| + if( U_SUCCESS(*err) &&
|
| + isTargetByteDBCS &&
|
| + args->flush && source>=sourceLimit && args->converter->fromUChar32==0
|
| + ) {
|
| + int32_t sourceIndex;
|
| +
|
| + /* we are switching to ASCII */
|
| + isTargetByteDBCS=FALSE;
|
| +
|
| + /* get the source index of the last input character */
|
| + /*
|
| + * TODO this would be simpler and more reliable if we used a pair
|
| + * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
|
| + * so that we could simply use the prevSourceIndex here;
|
| + * this code gives an incorrect result for the rare case of an unmatched
|
| + * trail surrogate that is alone in the last buffer of the text stream
|
| + */
|
| + sourceIndex=(int32_t)(source-args->source);
|
| + if(sourceIndex>0) {
|
| + --sourceIndex;
|
| + if( U16_IS_TRAIL(args->source[sourceIndex]) &&
|
| + (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
|
| + ) {
|
| + --sourceIndex;
|
| + }
|
| + } else {
|
| + sourceIndex=-1;
|
| + }
|
| +
|
| + fromUWriteUInt8(
|
| + args->converter,
|
| + SHIFT_IN_STR, 1,
|
| + &target, (const char *)targetLimit,
|
| + &offsets, sourceIndex,
|
| + err);
|
| + }
|
| +
|
| + /*save the state and return */
|
| + args->source = source;
|
| + args->target = (char*)target;
|
| + args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
|
| +}
|
| +
|
| +/************************ To Unicode ***************************************/
|
| +/*
|
| + * Version-1 path of the ISO-2022-KR toUnicode conversion: each run of input up to the
|
| + * limit returned by getEndOfBuffer_2022() is converted by the MBCS sub-converter, and
|
| + * what follows the run is passed to changeState_2022(). Partial input bytes, offsets
|
| + * and overflow output are moved between the public converter and the sub-converter
|
| + * around each run so that the caller only sees the public converter's state.
|
| + */
|
| +static void
|
| +UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
|
| + UErrorCode* err){
|
| + char const* sourceStart;
|
| + UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
| +
|
| + UConverterToUnicodeArgs subArgs;
|
| + int32_t minArgsSize;
|
| +
|
| + /* set up the subconverter arguments: copy no more than the smaller of the two struct sizes */
|
| + if(args->size<sizeof(UConverterToUnicodeArgs)) {
|
| + minArgsSize = args->size;
|
| + } else {
|
| + minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
|
| + }
|
| +
|
| + uprv_memcpy(&subArgs, args, minArgsSize);
|
| + subArgs.size = (uint16_t)minArgsSize;
|
| + subArgs.converter = myData->currentConverter;
|
| +
|
| + /* remember the original start of the input for offsets */
|
| + sourceStart = args->source;
|
| +
|
| + if(myData->key != 0) {
|
| + /* continue with a partial escape sequence */
|
| + goto escape;
|
| + }
|
| +
|
| + while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
|
| + /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
|
| + subArgs.source = args->source;
|
| + subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
|
| + if(subArgs.source != subArgs.sourceLimit) {
|
| + /*
|
| + * get the current partial byte sequence
|
| + *
|
| + * it needs to be moved between the public and the subconverter
|
| + * so that the conversion framework, which only sees the public
|
| + * converter, can handle truncated and illegal input etc.
|
| + */
|
| + if(args->converter->toULength > 0) {
|
| + uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
|
| + }
|
| + subArgs.converter->toULength = args->converter->toULength;
|
| +
|
| + /*
|
| + * Convert up to the end of the input, or to before the next escape character.
|
| + * Does not handle conversion extensions because the preToU[] state etc.
|
| + * is not copied.
|
| + */
|
| + ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
|
| +
|
| + if(args->offsets != NULL && sourceStart != args->source) {
|
| + /* update offsets to base them on the actual start of the input */
|
| + int32_t *offsets = args->offsets;
|
| + UChar *target = args->target;
|
| + int32_t delta = (int32_t)(args->source - sourceStart);
|
| + while(target < subArgs.target) {
|
| + if(*offsets >= 0) { /* negative offsets are left unchanged */
|
| + *offsets += delta;
|
| + }
|
| + ++offsets;
|
| + ++target;
|
| + }
|
| + }
|
| + args->source = subArgs.source;
|
| + args->target = subArgs.target;
|
| + args->offsets = subArgs.offsets;
|
| +
|
| + /* copy input/error/overflow buffers */
|
| + if(subArgs.converter->toULength > 0) {
|
| + uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
|
| + }
|
| + args->converter->toULength = subArgs.converter->toULength;
|
| +
|
| + if(*err == U_BUFFER_OVERFLOW_ERROR) {
|
| + if(subArgs.converter->UCharErrorBufferLength > 0) {
|
| + uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
|
| + subArgs.converter->UCharErrorBufferLength);
|
| + }
|
| + args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
|
| + subArgs.converter->UCharErrorBufferLength = 0;
|
| + }
|
| + }
|
| +
|
| + if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
|
| + return;
|
| + }
|
| +
|
| +escape:
|
| + changeState_2022(args->converter,
|
| + &(args->source),
|
| + args->sourceLimit,
|
| + ISO_2022_KR,
|
| + err);
|
| + }
|
| +}
|
| +/*
|
| + * Converts ISO-2022-KR bytes from args->source into UChars at args->target.
|
| + * Version 1 is handed to the _IBM variant. Otherwise SO selects double-byte mode (g==1),
|
| + * SI selects single-byte mode (g==0) and ESC is passed to changeState_2022().
|
| + * An SI that arrives while the segment opened by SO is still empty is reported as
|
| + * U_ILLEGAL_ESCAPE_SEQUENCE. A double-byte lead byte that ends the input is saved in
|
| + * converter->toUBytes[] and resumed on the next call.
|
| + */
|
| +static void
|
| +UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
| + UErrorCode* err){
|
| + char tempBuf[2]; /* the two bytes, each with 0x80 added, handed to the sub-converter lookup */
|
| + const char *mySource = ( char *) args->source;
|
| + UChar *myTarget = args->target;
|
| + const char *mySourceLimit = args->sourceLimit;
|
| + UChar32 targetUniChar = 0x0000;
|
| + UChar mySourceChar = 0x0000;
|
| + UConverterDataISO2022* myData;
|
| + UConverterSharedData* sharedData ;
|
| + UBool useFallback;
|
| +
|
| + myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
| + if(myData->version==1){
|
| + UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
|
| + return;
|
| + }
|
| +
|
| + /* initialize state */
|
| + sharedData = myData->currentConverter->sharedData;
|
| + useFallback = args->converter->useFallback;
|
| +
|
| + if(myData->key != 0) {
|
| + /* continue with a partial escape sequence */
|
| + goto escape;
|
| + } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
|
| + /* continue with a partial double-byte character: the lead byte was saved in toUBytes[0] */
|
| + mySourceChar = args->converter->toUBytes[0];
|
| + args->converter->toULength = 0;
|
| + goto getTrailByte;
|
| + }
|
| +
|
| + while(mySource< mySourceLimit){
|
| +
|
| + if(myTarget < args->targetLimit){
|
| +
|
| + mySourceChar= (unsigned char) *mySource++;
|
| +
|
| + if(mySourceChar==UCNV_SI){
|
| + myData->toU2022State.g = 0;
|
| + if (myData->isEmptySegment) {
|
| + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| + args->converter->toULength = 1;
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + return;
|
| + }
|
| + /*consume the source */
|
| + continue;
|
| + }else if(mySourceChar==UCNV_SO){
|
| + myData->toU2022State.g = 1;
|
| + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
| + /*consume the source */
|
| + continue;
|
| + }else if(mySourceChar==ESC_2022){
|
| + mySource--; /* back up so that changeState_2022() sees the ESC byte itself */
|
| +escape:
|
| + myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
|
| + changeState_2022(args->converter,&(mySource),
|
| + mySourceLimit, ISO_2022_KR, err);
|
| + if(U_FAILURE(*err)){
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + return;
|
| + }
|
| + continue;
|
| + }
|
| +
|
| + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
|
| + if(myData->toU2022State.g == 1) {
|
| + if(mySource < mySourceLimit) {
|
| + int leadIsOk, trailIsOk;
|
| + uint8_t trailByte;
|
| +getTrailByte:
|
| + targetUniChar = missingCharMarker;
|
| + trailByte = (uint8_t)*mySource;
|
| + /*
|
| + * Ticket 5691: consistent illegal sequences:
|
| + * - We include at least the first byte in the illegal sequence.
|
| + * - If any of the non-initial bytes could be the start of a character,
|
| + * we stop the illegal sequence before the first one of those.
|
| + *
|
| + * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
|
| + * an ESC/SO/SI, we report only the first byte as the illegal sequence.
|
| + * Otherwise we convert or report the pair of bytes.
|
| + */
|
| + leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
| + trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
|
| + if (leadIsOk && trailIsOk) {
|
| + ++mySource;
|
| + tempBuf[0] = (char)(mySourceChar + 0x80);
|
| + tempBuf[1] = (char)(trailByte + 0x80);
|
| + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
|
| + mySourceChar = (mySourceChar << 8) | trailByte;
|
| + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
|
| + /* report a pair of illegal bytes if the second byte is not a DBCS starter */
|
| + ++mySource;
|
| + /* add another bit so that the code below writes 2 bytes in case of error */
|
| + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
|
| + }
|
| + } else {
|
| + /* the lead byte is the last input byte: save it and resume at getTrailByte on the next call */
|
| + args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| + args->converter->toULength = 1;
|
| + break;
|
| + }
|
| + }
|
| + else if(mySourceChar <= 0x7f) {
|
| + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); /* mySource - 1 points at the byte just read */
|
| + } else {
|
| + targetUniChar = 0xffff; /* byte above 0x7f in single-byte mode: goes to the callback below */
|
| + }
|
| + if(targetUniChar < 0xfffe){
|
| + if(args->offsets) {
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); /* back up by the 1 or 2 bytes just consumed */
|
| + }
|
| + *(myTarget++)=(UChar)targetUniChar;
|
| + }
|
| + else {
|
| + /* Call the callback function*/
|
| + toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
|
| + break;
|
| + }
|
| + }
|
| + else{
|
| + *err =U_BUFFER_OVERFLOW_ERROR;
|
| + break;
|
| + }
|
| + }
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| +}
|
| +
|
| +/*************************** END ISO2022-KR *********************************/
|
| +
|
| +/*************************** ISO-2022-CN *********************************
|
| +*
|
| +* Rules for ISO-2022-CN Encoding:
|
| +* i) The designator sequence must appear once on a line before any instance
|
| +* of character set it designates.
|
| +* ii) If two lines contain characters from the same character set, both lines
|
| +* must include the designator sequence.
|
| +* iii) Once the designator sequence is known, a shifting sequence has to be found
|
| +* to invoke the shifting
|
| +* iv) All lines start in ASCII and end in ASCII.
|
| +* v) Four shifting sequences are employed for this purpose:
|
| +*
|
| +* Sequence ASCII Eq Charsets
|
| +* ---------- ------- ---------
|
| +* SI <SI> US-ASCII
|
| +* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
|
| +* SS2 <ESC>N CNS-11643-1992 Plane 2
|
| +* SS3 <ESC>O CNS-11643-1992 Planes 3-7
|
| +*
|
| +* vi)
|
| +* SOdesignator : ESC "$" ")" finalchar_for_SO
|
| +* SS2designator : ESC "$" "*" finalchar_for_SS2
|
| +* SS3designator : ESC "$" "+" finalchar_for_SS3
|
| +*
|
| +* ESC $ ) A Indicates the bytes following SO are Chinese
|
| +* characters as defined in GB 2312-80, until
|
| +* another SOdesignation appears
|
| +*
|
| +*
|
| +* ESC $ ) E Indicates the bytes following SO are as defined
|
| +* in ISO-IR-165 (for details, see section 2.1),
|
| +* until another SOdesignation appears
|
| +*
|
| +* ESC $ ) G Indicates the bytes following SO are as defined
|
| +* in CNS 11643-plane-1, until another
|
| +* SOdesignation appears
|
| +*
|
| +* ESC $ * H Indicates the two bytes immediately following
|
| +* SS2 is a Chinese character as defined in CNS
|
| +* 11643-plane-2, until another SS2designation
|
| +* appears
|
| +* (Meaning <ESC>N must precede every 2 byte
|
| +* sequence.)
|
| +*
|
| +* ESC $ + I Indicates the immediate two bytes following SS3
|
| +* is a Chinese character as defined in CNS
|
| +* 11643-plane-3, until another SS3designation
|
| +* appears
|
| +* (Meaning <ESC>O must precede every 2 byte
|
| +* sequence.)
|
| +*
|
| +* ESC $ + J Indicates the immediate two bytes following SS3
|
| +* is a Chinese character as defined in CNS
|
| +* 11643-plane-4, until another SS3designation
|
| +* appears
|
| +* (In English: <ESC>O must precede every 2 byte
|
| +* sequence.)
|
| +*
|
| +* ESC $ + K Indicates the immediate two bytes following SS3
|
| +* is a Chinese character as defined in CNS
|
| +* 11643-plane-5, until another SS3designation
|
| +* appears
|
| +*
|
| +* ESC $ + L Indicates the immediate two bytes following SS3
|
| +* is a Chinese character as defined in CNS
|
| +* 11643-plane-6, until another SS3designation
|
| +* appears
|
| +*
|
| +* ESC $ + M Indicates the immediate two bytes following SS3
|
| +* is a Chinese character as defined in CNS
|
| +* 11643-plane-7, until another SS3designation
|
| +* appears
|
| +*
|
| +* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
|
| +* has its own designation information before any Chinese characters
|
| +* appear
|
| +*
|
| +*/
|
| +
|
| +/* The following are defined this way to make the strings truly readonly */
|
| +static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A : SO designator */
|
| +static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E : SO designator */
|
| +static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G : SO designator */
|
| +static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H : SS2 designator */
|
| +static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I : SS3 designator */
|
| +static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J : SS3 designator */
|
| +static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K : SS3 designator */
|
| +static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L : SS3 designator */
|
| +static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M : SS3 designator */
|
| +
|
| +/********************** ISO2022-CN Data **************************/
|
| +static const char* const escSeqCharsCN[10] ={ /* designator/shift strings; the fromUnicode code copies 4 bytes from the selected entry */
|
| + SHIFT_IN_STR, /* 0 ASCII */
|
| + GB_2312_80_STR, /* 1 GB2312_1 */
|
| + ISO_IR_165_STR, /* 2 ISO_IR_165 */
|
| + CNS_11643_1992_Plane_1_STR, /* 3 CNS 11643 plane 1; planes are looked up as CNS_11643 + (cs - CNS_11643_1) */
|
| + CNS_11643_1992_Plane_2_STR, /* 4 CNS 11643 plane 2 */
|
| + CNS_11643_1992_Plane_3_STR, /* 5 CNS 11643 plane 3 */
|
| + CNS_11643_1992_Plane_4_STR, /* 6 CNS 11643 plane 4 */
|
| + CNS_11643_1992_Plane_5_STR, /* 7 CNS 11643 plane 5 */
|
| + CNS_11643_1992_Plane_6_STR, /* 8 CNS 11643 plane 6 */
|
| + CNS_11643_1992_Plane_7_STR /* 9 CNS 11643 plane 7 */
|
| +};
|
| +
|
| +/*
|
| + * fromUnicode implementation for the ISO-2022-CN family.
|
| + *
|
| + * Reads UTF-16 code units from args->source and writes encoded bytes to
|
| + * args->target (and one source index per byte to args->offsets if non-NULL).
|
| + *
|
| + * - A lead surrogate left over from the previous call is picked up from
|
| + *   cnv->fromUChar32 and completed with the first code unit of this buffer.
|
| + * - U+0000..U+007F is written as-is, preceded by SI if the converter is not
|
| + *   currently in G0; SO/SI/ESC themselves are rejected with
|
| + *   U_ILLEGAL_CHAR_FOUND. CR and LF reset the whole fromUnicode state.
|
| + * - Everything else is looked up in up to three candidate charsets
|
| + *   (choices[]); a roundtrip mapping wins over a fallback mapping. The
|
| + *   designator and shift sequences are emitted only when the designated
|
| + *   charset or the shift state actually changes.
|
| + * - Unmappable code points set *err to U_INVALID_CHAR_FOUND, unpaired
|
| + *   surrogates to U_ILLEGAL_CHAR_FOUND; in both cases the code point is
|
| + *   stored in cnv->fromUChar32 and the loop stops.
|
| + * - A full target sets U_BUFFER_OVERFLOW_ERROR (directly, or via
|
| + *   fromUWriteUInt8 for multi-byte output).
|
| + * - With args->flush at the end of complete input, a final SI is written if
|
| + *   the converter is not in ASCII mode.
|
| + *
|
| + * On return args->source and args->target are advanced past what was consumed
|
| + * and produced.
|
| + */
|
| +static void
|
| +UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
| +    UConverter *cnv = args->converter;
|
| +    UConverterDataISO2022 *converterData;
|
| +    ISO2022State *pFromU2022State;
|
| +    uint8_t *target = (uint8_t *) args->target;
|
| +    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
|
| +    const UChar* source = args->source;
|
| +    const UChar* sourceLimit = args->sourceLimit;
|
| +    int32_t* offsets = args->offsets;
|
| +    UChar32 sourceChar;
|
| +    char buffer[8]; /* worst case: 4-byte designator + 2-byte single shift + 2 data bytes */
|
| +    int32_t len;
|
| +    int8_t choices[3];
|
| +    int32_t choiceCount;
|
| +    uint32_t targetValue = 0;
|
| +    UBool useFallback;
|
| +
|
| +    /* set up the state */
|
| +    converterData = (UConverterDataISO2022*)cnv->extraInfo;
|
| +    pFromU2022State = &converterData->fromU2022State;
|
| +
|
| +    /* 0 means "choices[] must be (re)computed before the next lookup" */
|
| +    choiceCount = 0;
|
| +
|
| +    /* check if the last codepoint of previous buffer was a lead surrogate*/
|
| +    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
|
| +        goto getTrail;
|
| +    }
|
| +
|
| +    while( source < sourceLimit){
|
| +        if(target < targetLimit){
|
| +
|
| +            sourceChar = *(source++);
|
| +            /*check if the char is a First surrogate*/
|
| +            if(U16_IS_SURROGATE(sourceChar)) {
|
| +                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
|
| +getTrail:
|
| +                    /*look ahead to find the trail surrogate*/
|
| +                    if(source < sourceLimit) {
|
| +                        /* test the following code unit */
|
| +                        UChar trail=(UChar) *source;
|
| +                        if(U16_IS_TRAIL(trail)) {
|
| +                            source++;
|
| +                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
|
| +                            cnv->fromUChar32=0x00;
|
| +                            /* convert this supplementary code point */
|
| +                            /* exit this condition tree */
|
| +                        } else {
|
| +                            /* this is an unmatched lead code unit (1st surrogate) */
|
| +                            /* callback(illegal) */
|
| +                            *err=U_ILLEGAL_CHAR_FOUND;
|
| +                            cnv->fromUChar32=sourceChar;
|
| +                            break;
|
| +                        }
|
| +                    } else {
|
| +                        /* no more input: remember the lead surrogate for the next call */
|
| +                        cnv->fromUChar32=sourceChar;
|
| +                        break;
|
| +                    }
|
| +                } else {
|
| +                    /* this is an unmatched trail code unit (2nd surrogate) */
|
| +                    /* callback(illegal) */
|
| +                    *err=U_ILLEGAL_CHAR_FOUND;
|
| +                    cnv->fromUChar32=sourceChar;
|
| +                    break;
|
| +                }
|
| +            }
|
| +
|
| +            /* do the conversion */
|
| +            if(sourceChar <= 0x007f ){
|
| +                /* do not convert SO/SI/ESC */
|
| +                if(IS_2022_CONTROL(sourceChar)) {
|
| +                    /* callback(illegal) */
|
| +                    *err=U_ILLEGAL_CHAR_FOUND;
|
| +                    cnv->fromUChar32=sourceChar;
|
| +                    break;
|
| +                }
|
| +
|
| +                /* US-ASCII */
|
| +                if(pFromU2022State->g == 0) {
|
| +                    buffer[0] = (char)sourceChar;
|
| +                    len = 1;
|
| +                } else {
|
| +                    /* shift back to G0 first */
|
| +                    buffer[0] = UCNV_SI;
|
| +                    buffer[1] = (char)sourceChar;
|
| +                    len = 2;
|
| +                    pFromU2022State->g = 0;
|
| +                    choiceCount = 0;
|
| +                }
|
| +                if(sourceChar == CR || sourceChar == LF) {
|
| +                    /* reset the state at the end of a line */
|
| +                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
|
| +                    choiceCount = 0;
|
| +                }
|
| +            }
|
| +            else{
|
| +                /* convert U+0080..U+10ffff */
|
| +                int32_t i;
|
| +                int8_t cs, g;
|
| +
|
| +                if(choiceCount == 0) {
|
| +                    /* try the current SO/G1 converter first */
|
| +                    choices[0] = pFromU2022State->cs[1];
|
| +
|
| +                    /* default to GB2312_1 if none is designated yet */
|
| +                    if(choices[0] == 0) {
|
| +                        choices[0] = GB2312_1;
|
| +                    }
|
| +
|
| +                    if(converterData->version == 0) {
|
| +                        /* ISO-2022-CN */
|
| +
|
| +                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
|
| +                        if(choices[0] == GB2312_1) {
|
| +                            choices[1] = (int8_t)CNS_11643_1;
|
| +                        } else {
|
| +                            choices[1] = (int8_t)GB2312_1;
|
| +                        }
|
| +
|
| +                        choiceCount = 2;
|
| +                    } else if (converterData->version == 1) {
|
| +                        /* ISO-2022-CN-EXT */
|
| +
|
| +                        /* try one of the other converters */
|
| +                        switch(choices[0]) {
|
| +                        case GB2312_1:
|
| +                            choices[1] = (int8_t)CNS_11643_1;
|
| +                            choices[2] = (int8_t)ISO_IR_165;
|
| +                            break;
|
| +                        case ISO_IR_165:
|
| +                            choices[1] = (int8_t)GB2312_1;
|
| +                            choices[2] = (int8_t)CNS_11643_1;
|
| +                            break;
|
| +                        default: /* CNS_11643_x */
|
| +                            choices[1] = (int8_t)GB2312_1;
|
| +                            choices[2] = (int8_t)ISO_IR_165;
|
| +                            break;
|
| +                        }
|
| +
|
| +                        choiceCount = 3;
|
| +                    } else {
|
| +                        /* any other version: prefer CNS 11643, then GB 2312 */
|
| +                        choices[0] = (int8_t)CNS_11643_1;
|
| +                        choices[1] = (int8_t)GB2312_1;
|
| +
|
| +                        /*
|
| +                         * Both entries must be counted; with choiceCount left at 0
|
| +                         * the lookup loop below never ran and every non-ASCII
|
| +                         * code point was reported as unmappable.
|
| +                         */
|
| +                        choiceCount = 2;
|
| +                    }
|
| +                }
|
| +
|
| +                cs = g = 0;
|
| +                /*
|
| +                 * len==0: no mapping found yet
|
| +                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
|
| +                 * len>0: found a roundtrip result, done
|
| +                 */
|
| +                len = 0;
|
| +                /*
|
| +                 * We will turn off useFallback after finding a fallback,
|
| +                 * but we still get fallbacks from PUA code points as usual.
|
| +                 * Therefore, we will also need to check that we don't overwrite
|
| +                 * an early fallback with a later one.
|
| +                 */
|
| +                useFallback = cnv->useFallback;
|
| +
|
| +                for(i = 0; i < choiceCount && len <= 0; ++i) {
|
| +                    int8_t cs0 = choices[i];
|
| +                    if(cs0 > 0) {
|
| +                        uint32_t value;
|
| +                        int32_t len2;
|
| +                        if(cs0 >= CNS_11643_0) {
|
| +                            /* one 3-byte lookup covers all CNS planes; the top byte selects the plane */
|
| +                            len2 = MBCS_FROM_UCHAR32_ISO2022(
|
| +                                        converterData->myConverterArray[CNS_11643],
|
| +                                        sourceChar,
|
| +                                        &value,
|
| +                                        useFallback,
|
| +                                        MBCS_OUTPUT_3);
|
| +                            if(len2 == 3 || (len2 == -3 && len == 0)) {
|
| +                                targetValue = value;
|
| +                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
|
| +                                if(len2 >= 0) {
|
| +                                    len = 2;
|
| +                                } else {
|
| +                                    len = -2;
|
| +                                    useFallback = FALSE;
|
| +                                }
|
| +                                if(cs == CNS_11643_1) {
|
| +                                    g = 1;
|
| +                                } else if(cs == CNS_11643_2) {
|
| +                                    g = 2;
|
| +                                } else /* plane 3..7 */ if(converterData->version == 1) {
|
| +                                    g = 3;
|
| +                                } else {
|
| +                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
|
| +                                    len = 0;
|
| +                                }
|
| +                            }
|
| +                        } else {
|
| +                            /* GB2312_1 or ISO-IR-165 */
|
| +                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
|
| +                            len2 = MBCS_FROM_UCHAR32_ISO2022(
|
| +                                        converterData->myConverterArray[cs0],
|
| +                                        sourceChar,
|
| +                                        &value,
|
| +                                        useFallback,
|
| +                                        MBCS_OUTPUT_2);
|
| +                            if(len2 == 2 || (len2 == -2 && len == 0)) {
|
| +                                targetValue = value;
|
| +                                len = len2;
|
| +                                cs = cs0;
|
| +                                g = 1;
|
| +                                useFallback = FALSE;
|
| +                            }
|
| +                        }
|
| +                    }
|
| +                }
|
| +
|
| +                if(len != 0) {
|
| +                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
|
| +
|
| +                    /* write the designation sequence if necessary */
|
| +                    if(cs != pFromU2022State->cs[g]) {
|
| +                        if(cs < CNS_11643) {
|
| +                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
|
| +                        } else {
|
| +                            U_ASSERT(cs >= CNS_11643_1);
|
| +                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
|
| +                        }
|
| +                        len = 4;
|
| +                        pFromU2022State->cs[g] = cs;
|
| +                        if(g == 1) {
|
| +                            /* changing the SO/G1 charset invalidates the choices[] */
|
| +                            choiceCount = 0;
|
| +                        }
|
| +                    }
|
| +
|
| +                    /* write the shift sequence if necessary */
|
| +                    if(g != pFromU2022State->g) {
|
| +                        switch(g) {
|
| +                        case 1:
|
| +                            buffer[len++] = UCNV_SO;
|
| +
|
| +                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
|
| +                            pFromU2022State->g = 1;
|
| +                            break;
|
| +                        case 2:
|
| +                            /* SS2: ESC N */
|
| +                            buffer[len++] = 0x1b;
|
| +                            buffer[len++] = 0x4e;
|
| +                            break;
|
| +                        default: /* case 3 */
|
| +                            /* SS3: ESC O */
|
| +                            buffer[len++] = 0x1b;
|
| +                            buffer[len++] = 0x4f;
|
| +                            break;
|
| +                        }
|
| +                    }
|
| +
|
| +                    /* write the two output bytes */
|
| +                    buffer[len++] = (char)(targetValue >> 8);
|
| +                    buffer[len++] = (char)targetValue;
|
| +                } else {
|
| +                    /* if we cannot find the character after checking all codepages
|
| +                     * then this is an error
|
| +                     */
|
| +                    *err = U_INVALID_CHAR_FOUND;
|
| +                    cnv->fromUChar32=sourceChar;
|
| +                    break;
|
| +                }
|
| +            }
|
| +
|
| +            /* output len>0 bytes in buffer[] */
|
| +            if(len == 1) {
|
| +                *target++ = buffer[0];
|
| +                if(offsets) {
|
| +                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
|
| +                }
|
| +            } else if(len == 2 && (target + 2) <= targetLimit) {
|
| +                *target++ = buffer[0];
|
| +                *target++ = buffer[1];
|
| +                if(offsets) {
|
| +                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
|
| +                    *offsets++ = sourceIndex;
|
| +                    *offsets++ = sourceIndex;
|
| +                }
|
| +            } else {
|
| +                /* longer output, or output that may not fit: let the helper handle overflow */
|
| +                fromUWriteUInt8(
|
| +                    cnv,
|
| +                    buffer, len,
|
| +                    &target, (const char *)targetLimit,
|
| +                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
|
| +                    err);
|
| +                if(U_FAILURE(*err)) {
|
| +                    break;
|
| +                }
|
| +            }
|
| +        } /* end if(myTargetIndex<myTargetLength) */
|
| +        else{
|
| +            *err =U_BUFFER_OVERFLOW_ERROR;
|
| +            break;
|
| +        }
|
| +
|
| +    }/* end while(mySourceIndex<mySourceLength) */
|
| +
|
| +    /*
|
| +     * the end of the input stream and detection of truncated input
|
| +     * are handled by the framework, but for ISO-2022-CN conversion
|
| +     * we need to be in ASCII mode at the very end
|
| +     *
|
| +     * conditions:
|
| +     * successful
|
| +     * not in ASCII mode
|
| +     * end of input and no truncated input
|
| +     */
|
| +    if( U_SUCCESS(*err) &&
|
| +        pFromU2022State->g!=0 &&
|
| +        args->flush && source>=sourceLimit && cnv->fromUChar32==0
|
| +    ) {
|
| +        int32_t sourceIndex;
|
| +
|
| +        /* we are switching to ASCII */
|
| +        pFromU2022State->g=0;
|
| +
|
| +        /* get the source index of the last input character */
|
| +        /*
|
| +         * TODO this would be simpler and more reliable if we used a pair
|
| +         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
|
| +         * so that we could simply use the prevSourceIndex here;
|
| +         * this code gives an incorrect result for the rare case of an unmatched
|
| +         * trail surrogate that is alone in the last buffer of the text stream
|
| +         */
|
| +        sourceIndex=(int32_t)(source-args->source);
|
| +        if(sourceIndex>0) {
|
| +            --sourceIndex;
|
| +            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
|
| +                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
|
| +            ) {
|
| +                --sourceIndex;
|
| +            }
|
| +        } else {
|
| +            sourceIndex=-1;
|
| +        }
|
| +
|
| +        fromUWriteUInt8(
|
| +            cnv,
|
| +            SHIFT_IN_STR, 1,
|
| +            &target, (const char *)targetLimit,
|
| +            &offsets, sourceIndex,
|
| +            err);
|
| +    }
|
| +
|
| +    /*save the state and return */
|
| +    args->source = source;
|
| +    args->target = (char*)target;
|
| +}
|
| +
|
| +
|
| +static void
|
| +UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
| + UErrorCode* err){ /* ISO-2022-CN toUnicode: decodes bytes from args->source into UChars at args->target, recording one source offset per UChar when args->offsets is non-NULL; failures are reported through *err */
|
| + char tempBuf[3]; /* bytes handed to the sub-converter: 2 for the DBCS sets, 3 (plane byte + 2) for CNS 11643 */
|
| + const char *mySource = (char *) args->source;
|
| + UChar *myTarget = args->target;
|
| + const char *mySourceLimit = args->sourceLimit;
|
| + uint32_t targetUniChar = 0x0000;
|
| + uint32_t mySourceChar = 0x0000;
|
| + UConverterDataISO2022* myData;
|
| + ISO2022State *pToU2022State;
|
| +
|
| + myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
| + pToU2022State = &myData->toU2022State;
|
| +
|
| + if(myData->key != 0) {
|
| + /* continue with a partial escape sequence (signalled by a nonzero key) */
|
| + goto escape;
|
| + } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
|
| + /* continue with a partial double-byte character: the lead byte was saved in toUBytes[0] when the input ran out */
|
| + mySourceChar = args->converter->toUBytes[0];
|
| + args->converter->toULength = 0;
|
| + targetUniChar = missingCharMarker;
|
| + goto getTrailByte;
|
| + }
|
| +
|
| + while(mySource < mySourceLimit){
|
| +
|
| + targetUniChar =missingCharMarker;
|
| +
|
| + if(myTarget < args->targetLimit){
|
| +
|
| + mySourceChar= (unsigned char) *mySource++;
|
| +
|
| + switch(mySourceChar){
|
| + case UCNV_SI:
|
| + pToU2022State->g=0; /* SI: shift back to G0 */
|
| + if (myData->isEmptySegment) {
|
| + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toUBytes[0] = mySourceChar;
|
| + args->converter->toULength = 1;
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + return;
|
| + }
|
| + continue;
|
| +
|
| + case UCNV_SO:
|
| + if(pToU2022State->cs[1] != 0) { /* SO is only accepted once a G1 charset has been designated */
|
| + pToU2022State->g=1;
|
| + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
| + continue;
|
| + } else {
|
| + /* illegal to have SO before a matching designator */
|
| + myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
|
| + break; /* leaves the switch with targetUniChar still missingCharMarker, so the callback branch below reports the SO byte */
|
| + }
|
| +
|
| + case ESC_2022:
|
| + mySource--; /* step back so that changeState_2022() sees the ESC byte itself */
|
| +escape:
|
| + {
|
| + const char * mySourceBefore = mySource;
|
| + int8_t toULengthBefore = args->converter->toULength;
|
| +
|
| + changeState_2022(args->converter,&(mySource),
|
| + mySourceLimit, ISO_2022_CN,err);
|
| +
|
| + /* After SO there must be at least one character before a designator (designator error handled separately) */
|
| + if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); /* previous length plus the bytes consumed by this changeState_2022() call */
|
| + }
|
| + }
|
| +
|
| + /* invalid or illegal escape sequence */
|
| + if(U_FAILURE(*err)){
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
| + return;
|
| + }
|
| + continue;
|
| +
|
| + /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
|
| +
|
| + case CR:
|
| + /*falls through*/
|
| + case LF:
|
| + uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); /* end of line: clear all designations and the shift state */
|
| + /* falls through */
|
| + default:
|
| + /* convert one or two bytes */
|
| + myData->isEmptySegment = FALSE;
|
| + if(pToU2022State->g != 0) {
|
| + if(mySource < mySourceLimit) {
|
| + UConverterSharedData *cnv;
|
| + StateEnum tempState;
|
| + int32_t tempBufLen;
|
| + int leadIsOk, trailIsOk;
|
| + uint8_t trailByte;
|
| +getTrailByte:
|
| + trailByte = (uint8_t)*mySource; /* peek only; consumed below once the pair is accepted or reported */
|
| + /*
|
| + * Ticket 5691: consistent illegal sequences:
|
| + * - We include at least the first byte in the illegal sequence.
|
| + * - If any of the non-initial bytes could be the start of a character,
|
| + * we stop the illegal sequence before the first one of those.
|
| + *
|
| + * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
|
| + * an ESC/SO/SI, we report only the first byte as the illegal sequence.
|
| + * Otherwise we convert or report the pair of bytes.
|
| + */
|
| + leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
| + trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
|
| + if (leadIsOk && trailIsOk) {
|
| + ++mySource;
|
| + tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
| + if(tempState >= CNS_11643_0) {
|
| + cnv = myData->myConverterArray[CNS_11643];
|
| + tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); /* plane selector byte: 0x80 + plane number */
|
| + tempBuf[1] = (char) (mySourceChar);
|
| + tempBuf[2] = (char) trailByte;
|
| + tempBufLen = 3;
|
| +
|
| + }else{
|
| + U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
|
| + cnv = myData->myConverterArray[tempState];
|
| + tempBuf[0] = (char) (mySourceChar);
|
| + tempBuf[1] = (char) trailByte;
|
| + tempBufLen = 2;
|
| + }
|
| + targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); /* FALSE: fallback mappings are not used here */
|
| + mySourceChar = (mySourceChar << 8) | trailByte;
|
| + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
|
| + /* report a pair of illegal bytes if the second byte is not a DBCS starter */
|
| + ++mySource;
|
| + /* add another bit so that the code below writes 2 bytes in case of error */
|
| + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
|
| + }
|
| + if(pToU2022State->g>=2) {
|
| + /* return from a single-shift state to the previous one */
|
| + pToU2022State->g=pToU2022State->prevG;
|
| + }
|
| + } else {
|
| + args->converter->toUBytes[0] = (uint8_t)mySourceChar; /* input ended after the lead byte: save it for the next call */
|
| + args->converter->toULength = 1;
|
| + goto endloop;
|
| + }
|
| + }
|
| + else{
|
| + if(mySourceChar <= 0x7f) {
|
| + targetUniChar = (UChar) mySourceChar; /* G0: 7-bit bytes map to themselves; larger bytes stay unassigned */
|
| + }
|
| + }
|
| + break;
|
| + }
|
| + if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
|
| + if(args->offsets){
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
|
| + }
|
| + *(myTarget++)=(UChar)targetUniChar;
|
| + }
|
| + else if(targetUniChar > missingCharMarker){
|
| + /* disassemble the surrogate pair and write to output*/
|
| + targetUniChar-=0x0010000;
|
| + *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
|
| + if(args->offsets){
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
|
| + }
|
| + ++myTarget;
|
| + if(myTarget< args->targetLimit){
|
| + *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
|
| + if(args->offsets){
|
| + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
|
| + }
|
| + ++myTarget;
|
| + }else{
|
| + args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= /* no room left for the trail surrogate: park it in the converter's UChar error buffer */
|
| + (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
|
| + }
|
| +
|
| + }
|
| + else{
|
| + /* Call the callback function*/
|
| + toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
|
| + break;
|
| + }
|
| + }
|
| + else{
|
| + *err =U_BUFFER_OVERFLOW_ERROR;
|
| + break;
|
| + }
|
| + }
|
| +endloop:
|
| + args->target = myTarget; /* publish how far the output and input pointers advanced */
|
| + args->source = mySource;
|
| +}
|
| +
|
| +/*
|
| + * Writes the converter's substitution character for an ISO 2022 variant,
|
| + * first returning to a state in which that character is valid.
|
| + *
|
| + * The variant is selected by the first letter of the stored locale:
|
| + *   'j'       - leave G1 with SI if necessary, re-designate ASCII in G0 unless
|
| + *               G0 already holds ASCII or JIS X 0201, then write subchar[0].
|
| + *   'c', 'z'  - emit SI if not in ASCII mode, then write subchar[0].
|
| + *               ('z' is accepted for consistency with _ISO_2022_GetUnicodeSet,
|
| + *               which treats both letters as the Chinese variant.)
|
| + *   'k'       - version 0: switch to SBCS (SI) or DBCS (SO) to match the
|
| + *               substitution length (1 or 2) and write it;
|
| + *               other versions: delegate to the sub-converter, temporarily
|
| + *               lending it this converter's substitution string.
|
| + *
|
| + * args        - fromUnicode arguments of the ISO 2022 converter
|
| + * offsetIndex - source index recorded for every byte written
|
| + * err         - in/out error code; may become U_BUFFER_OVERFLOW_ERROR
|
| + */
|
| +static void
|
| +_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
|
| +    UConverter *cnv = args->converter;
|
| +    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
|
| +    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
|
| +    char *p, *subchar;
|
| +    char buffer[8];
|
| +    int32_t length;
|
| +
|
| +    subchar=(char *)cnv->subChars;
|
| +    length=cnv->subCharLen; /* assume length==1 for most variants */
|
| +
|
| +    p = buffer;
|
| +    switch(myConverterData->locale[0]){
|
| +    case 'j':
|
| +        {
|
| +            int8_t cs;
|
| +
|
| +            if(pFromU2022State->g == 1) {
|
| +                /* JIS7: switch from G1 to G0 */
|
| +                pFromU2022State->g = 0;
|
| +                *p++ = UCNV_SI;
|
| +            }
|
| +
|
| +            cs = pFromU2022State->cs[0];
|
| +            if(cs != ASCII && cs != JISX201) {
|
| +                /* not in ASCII or JIS X 0201: switch to ASCII (ESC ( B) */
|
| +                pFromU2022State->cs[0] = (int8_t)ASCII;
|
| +                *p++ = '\x1b';
|
| +                *p++ = '\x28';
|
| +                *p++ = '\x42';
|
| +            }
|
| +
|
| +            *p++ = subchar[0];
|
| +            break;
|
| +        }
|
| +    case 'c':
|
| +    case 'z':
|
| +        if(pFromU2022State->g != 0) {
|
| +            /* not in ASCII mode: switch to ASCII */
|
| +            pFromU2022State->g = 0;
|
| +            *p++ = UCNV_SI;
|
| +        }
|
| +        *p++ = subchar[0];
|
| +        break;
|
| +    case 'k':
|
| +        if(myConverterData->version == 0) {
|
| +            if(length == 1) {
|
| +                if((UBool)args->converter->fromUnicodeStatus) {
|
| +                    /* in DBCS mode: switch to SBCS */
|
| +                    args->converter->fromUnicodeStatus = 0;
|
| +                    *p++ = UCNV_SI;
|
| +                }
|
| +                *p++ = subchar[0];
|
| +            } else /* length == 2*/ {
|
| +                if(!(UBool)args->converter->fromUnicodeStatus) {
|
| +                    /* in SBCS mode: switch to DBCS */
|
| +                    args->converter->fromUnicodeStatus = 1;
|
| +                    *p++ = UCNV_SO;
|
| +                }
|
| +                *p++ = subchar[0];
|
| +                *p++ = subchar[1];
|
| +            }
|
| +            break;
|
| +        } else {
|
| +            /* save the subconverter's substitution string */
|
| +            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
|
| +            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
|
| +
|
| +            /* set our substitution string into the subconverter */
|
| +            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
|
| +            myConverterData->currentConverter->subCharLen = (int8_t)length;
|
| +
|
| +            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
|
| +            args->converter = myConverterData->currentConverter;
|
| +            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
|
| +            /* forward the caller's offsetIndex, as every other path of this function does */
|
| +            ucnv_cbFromUWriteSub(args, offsetIndex, err);
|
| +            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
|
| +            args->converter = cnv;
|
| +
|
| +            /* restore the subconverter's substitution string */
|
| +            myConverterData->currentConverter->subChars = currentSubChars;
|
| +            myConverterData->currentConverter->subCharLen = currentSubCharLen;
|
| +
|
| +            if(*err == U_BUFFER_OVERFLOW_ERROR) {
|
| +                /* move bytes that did not fit from the subconverter's overflow buffer into ours */
|
| +                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
|
| +                    uprv_memcpy(
|
| +                        cnv->charErrorBuffer,
|
| +                        myConverterData->currentConverter->charErrorBuffer,
|
| +                        myConverterData->currentConverter->charErrorBufferLength);
|
| +                }
|
| +                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
|
| +                myConverterData->currentConverter->charErrorBufferLength = 0;
|
| +            }
|
| +            return;
|
| +        }
|
| +    default:
|
| +        /* not expected */
|
| +        break;
|
| +    }
|
| +    /* flush whatever was assembled in buffer[] (nothing for an unexpected locale) */
|
| +    ucnv_cbFromUWriteBytes(args,
|
| +                           buffer, (int32_t)(p - buffer),
|
| +                           offsetIndex, err);
|
| +}
|
| +
|
| +/*
|
| + * Structure for cloning an ISO 2022 converter into a single memory block.
|
| + * ucnv_safeClone() of the converter will align the entire cloneStruct,
|
| + * and then ucnv_safeClone() of the sub-converter may additionally align
|
| + * currentConverter inside the cloneStruct, for which we need the deadSpace
|
| + * after currentConverter.
|
| + * This is because UAlignedMemory may be larger than the actually
|
| + * necessary alignment size for the platform.
|
| + * The other cloneStruct fields will not be moved around,
|
| + * and are aligned properly with cloneStruct's alignment.
|
| + */
|
| +struct cloneStruct
|
| +{
|
| + UConverter cnv; /* the cloned ISO 2022 converter; _ISO_2022_SafeClone() returns its address */
|
| + UConverter currentConverter; /* storage passed to ucnv_safeClone() for the sub-converter clone */
|
| + UAlignedMemory deadSpace; /* slack directly after currentConverter in case the sub-converter clone gets re-aligned */
|
| + UConverterDataISO2022 mydata; /* copy of the ISO 2022 extra data; cnv.extraInfo is pointed at this member */
|
| +};
|
| +
|
| +
|
| +/*
|
| + * safeClone hook for ISO 2022 converters.
|
| + *
|
| + * cnv         - converter being cloned; its extraInfo is a UConverterDataISO2022
|
| + * stackBuffer - memory for a struct cloneStruct (the caller has already copied
|
| + *               the main UConverter into it)
|
| + * pBufferSize - if it points to 0 this is a preflight call: the required size
|
| + *               is stored there and NULL is returned
|
| + * status      - receives any failure from cloning the sub-converter
|
| + *
|
| + * Returns the clone's UConverter, or NULL for preflighting / on failure.
|
| + */
|
| +static UConverter *
|
| +_ISO_2022_SafeClone(
|
| +            const UConverter *cnv,
|
| +            void *stackBuffer,
|
| +            int32_t *pBufferSize,
|
| +            UErrorCode *status)
|
| +{
|
| +    struct cloneStruct *clone;
|
| +    UConverterDataISO2022 *srcData;
|
| +    int32_t slot;
|
| +
|
| +    /* preflight request: only report how much memory a clone needs */
|
| +    if (0 == *pBufferSize) {
|
| +        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
|
| +        return NULL;
|
| +    }
|
| +
|
| +    clone = (struct cloneStruct *)stackBuffer;
|
| +    srcData = (UConverterDataISO2022 *)cnv->extraInfo;
|
| +
|
| +    /* the main UConverter was copied by ucnv_safeClone(); duplicate the extra data next to it */
|
| +    uprv_memcpy(&clone->mydata, srcData, sizeof(UConverterDataISO2022));
|
| +    clone->cnv.isExtraLocal = TRUE;
|
| +    clone->cnv.extraInfo = &clone->mydata;
|
| +
|
| +    /* the current sub-converter, if any, gets its own clone inside the struct */
|
| +    if (NULL != srcData->currentConverter) {
|
| +        /* offer the UConverter slot plus the padding that follows it */
|
| +        int32_t subCloneSize = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory));
|
| +
|
| +        clone->mydata.currentConverter =
|
| +            ucnv_safeClone(srcData->currentConverter,
|
| +                           &clone->currentConverter,
|
| +                           &subCloneSize, status);
|
| +        if (U_FAILURE(*status)) {
|
| +            return NULL;
|
| +        }
|
| +    }
|
| +
|
| +    /* the shared sub-converter tables are not copied, only reference-counted */
|
| +    slot = 0;
|
| +    while (slot < UCNV_2022_MAX_CONVERTERS) {
|
| +        if (NULL != srcData->myConverterArray[slot]) {
|
| +            ucnv_incrementRefCount(srcData->myConverterArray[slot]);
|
| +        }
|
| +        ++slot;
|
| +    }
|
| +
|
| +    return &clone->cnv;
|
| +}
|
| +
|
| +static void
|
| +_ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
| + const USetAdder *sa,
|
| + UConverterUnicodeSet which,
|
| + UErrorCode *pErrorCode)
|
| +{
|
| + int32_t i;
|
| + UConverterDataISO2022* cnvData;
|
| +
|
| + if (U_FAILURE(*pErrorCode)) {
|
| + return;
|
| + }
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022
|
| + if (cnv->sharedData == &_ISO2022Data) {
|
| + /* We use UTF-8 in this case */
|
| + sa->addRange(sa->set, 0, 0xd7FF);
|
| + sa->addRange(sa->set, 0xE000, 0x10FFFF);
|
| + return;
|
| + }
|
| +#endif
|
| +
|
| + cnvData = (UConverterDataISO2022*)cnv->extraInfo;
|
| +
|
| + /* open a set and initialize it with code points that are algorithmically round-tripped */
|
| + switch(cnvData->locale[0]){
|
| + case 'j':
|
| + /* include JIS X 0201 which is hardcoded */
|
| + sa->add(sa->set, 0xa5);
|
| + sa->add(sa->set, 0x203e);
|
| + if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
|
| + /* include Latin-1 for some variants of JP */
|
| + sa->addRange(sa->set, 0, 0xff);
|
| + } else {
|
| + /* include ASCII for JP */
|
| + sa->addRange(sa->set, 0, 0x7f);
|
| + }
|
| + if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
|
| + /*
|
| + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
|
| + * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
|
| + * use half-width Katakana.
|
| + * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
|
| + * half-width Katakana via the ESC ( I sequence.
|
| + * However, we only emit (fromUnicode) half-width Katakana according to the
|
| + * definition of each variant.
|
| + *
|
| + * When including fallbacks,
|
| + * we need to include half-width Katakana Unicode code points for all JP variants because
|
| + * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
|
| + */
|
| + /* include half-width Katakana for JP */
|
| + sa->addRange(sa->set, HWKANA_START, HWKANA_END);
|
| + }
|
| + break;
|
| + case 'c':
|
| + case 'z':
|
| + /* include ASCII for CN */
|
| + sa->addRange(sa->set, 0, 0x7f);
|
| + break;
|
| + case 'k':
|
| + /* there is only one converter for KR, and it is not in the myConverterArray[] */
|
| + cnvData->currentConverter->sharedData->impl->getUnicodeSet(
|
| + cnvData->currentConverter, sa, which, pErrorCode);
|
| + /* the loop over myConverterArray[] will simply not find another converter */
|
| + break;
|
| + default:
|
| + break;
|
| + }
|
| +
|
| +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
|
| + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
|
| + cnvData->version==0 && i==CNS_11643
|
| + ) {
|
| + /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
|
| + ucnv_MBCSGetUnicodeSetForBytes(
|
| + cnvData->myConverterArray[i],
|
| + sa, UCNV_ROUNDTRIP_SET,
|
| + 0, 0x81, 0x82,
|
| + pErrorCode);
|
| + }
|
| +#endif
|
| +
|
| + for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
| + UConverterSetFilter filter;
|
| + if(cnvData->myConverterArray[i]!=NULL) {
|
| + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
|
| + cnvData->version==0 && i==CNS_11643
|
| + ) {
|
| + /*
|
| + * Version-specific for CN:
|
| + * CN version 0 does not map CNS planes 3..7 although
|
| + * they are all available in the CNS conversion table;
|
| + * CN version 1 (-EXT) does map them all.
|
| + * The two versions create different Unicode sets.
|
| + */
|
| + filter=UCNV_SET_FILTER_2022_CN;
|
| + } else if(cnvData->locale[0]=='j' && i==JISX208) {
|
| + /*
|
| + * Only add code points that map to Shift-JIS codes
|
| + * corresponding to JIS X 0208.
|
| + */
|
| + filter=UCNV_SET_FILTER_SJIS;
|
| + } else if(i==KSC5601) {
|
| + /*
|
| + * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
|
| + * are broader than GR94.
|
| + */
|
| + filter=UCNV_SET_FILTER_GR94DBCS;
|
| + } else {
|
| + filter=UCNV_SET_FILTER_NONE;
|
| + }
|
| + ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
|
| + }
|
| + }
|
| +
|
| + /*
|
| + * ISO 2022 converters must not convert SO/SI/ESC despite what
|
| + * sub-converters do by themselves.
|
| + * Remove these characters from the set.
|
| + */
|
| + sa->remove(sa->set, 0x0e);
|
| + sa->remove(sa->set, 0x0f);
|
| + sa->remove(sa->set, 0x1b);
|
| +
|
| + /* ISO 2022 converters do not convert C1 controls either */
|
| + sa->removeRange(sa->set, 0x80, 0x9f);
|
| +}
|
| +
|
| +static const UConverterImpl _ISO2022Impl={ /* function table for the generic ISO-2022 converter; referenced by _ISO2022Data */
|
| + UCNV_ISO_2022, /* converter type constant */
|
| + /* NOTE(review): slot labels below assume the UConverterImpl member order declared in ucnv_cnv.h -- confirm */
|
| + NULL, /* presumably load: none */
|
| + NULL, /* presumably unload: none */
|
| +
|
| + _ISO2022Open, /* presumably open */
|
| + _ISO2022Close, /* presumably close */
|
| + _ISO2022Reset, /* presumably reset */
|
| +
|
| +#ifdef U_ENABLE_GENERIC_ISO_2022 /* conversion functions are installed only when the generic converter is enabled */
|
| + T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* presumably toUnicode; same function as the next slot */
|
| + T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* presumably toUnicodeWithOffsets */
|
| + ucnv_fromUnicode_UTF8, /* presumably fromUnicode */
|
| + ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* presumably fromUnicodeWithOffsets */
|
| +#else /* generic converter disabled: all four conversion slots are left empty */
|
| + NULL,
|
| + NULL,
|
| + NULL,
|
| + NULL,
|
| +#endif
|
| + NULL, /* presumably getNextUChar: none */
|
| +
|
| + NULL, /* presumably getStarters: none */
|
| + _ISO2022getName, /* presumably getName */
|
| + _ISO_2022_WriteSub, /* presumably writeSub */
|
| + _ISO_2022_SafeClone, /* presumably safeClone */
|
| + _ISO_2022_GetUnicodeSet, /* presumably getUnicodeSet */
|
| +
|
| + NULL, /* presumably toUTF8: none */
|
| + NULL /* presumably fromUTF8: none */
|
| +};
|
| +static const UConverterStaticData _ISO2022StaticData={ /* constant descriptor for the generic "ISO_2022" converter; NOTE(review): field labels assume the UConverterStaticData layout in ucnv_bld.h -- confirm */
|
| + sizeof(UConverterStaticData), /* size of this structure */
|
| + "ISO_2022", /* converter name */
|
| + 2022, /* presumably the codepage number */
|
| + UCNV_IBM, /* presumably the platform */
|
| + UCNV_ISO_2022, /* presumably the conversion type */
|
| + 1, /* presumably min bytes per char (the max follows) */
|
| + 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
|
| + { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence */
|
| + 1, /* presumably the length of that sequence */
|
| + FALSE, /* presumably hasToUnicodeFallback */
|
| + FALSE, /* presumably hasFromUnicodeFallback */
|
| + 0, /* presumably unicodeMask */
|
| + 0, /* presumably subChar1 */
|
| + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
| +};
|
| +const UConverterSharedData _ISO2022Data={ /* shared-data record pairing _ISO2022StaticData with _ISO2022Impl; unlike the JP/KR/CN records it is not inside an unnamed namespace */
|
| + sizeof(UConverterSharedData), /* size of this structure */
|
| + ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking static data -- confirm against ucnv_bld.h */
|
| + NULL, /* presumably dataMemory: none */
|
| + NULL, /* presumably table: none */
|
| + &_ISO2022StaticData, /* static descriptor defined above */
|
| + FALSE, /* presumably sharedDataCached */
|
| + &_ISO2022Impl, /* function table defined above */
|
| + 0, UCNV_MBCS_TABLE_INITIALIZER /* presumably toUnicodeStatus, then the embedded MBCS table */
|
| +};
|
| +
|
| +/*************JP****************/
|
| +static const UConverterImpl _ISO2022JPImpl={ /* function table for the ISO-2022-JP converter; referenced by _ISO2022JPData */
|
| + UCNV_ISO_2022, /* converter type constant */
|
| + /* NOTE(review): slot labels below assume the UConverterImpl member order declared in ucnv_cnv.h -- confirm */
|
| + NULL, /* presumably load: none */
|
| + NULL, /* presumably unload: none */
|
| +
|
| + _ISO2022Open, /* open/close/reset are the same functions used by the other ISO-2022 tables in this file */
|
| + _ISO2022Close,
|
| + _ISO2022Reset,
|
| +
|
| + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* presumably toUnicode; same function as the next slot */
|
| + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* presumably toUnicodeWithOffsets */
|
| + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* presumably fromUnicode; same function as the next slot */
|
| + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* presumably fromUnicodeWithOffsets */
|
| + NULL, /* presumably getNextUChar: none */
|
| +
|
| + NULL, /* presumably getStarters: none */
|
| + _ISO2022getName, /* presumably getName */
|
| + _ISO_2022_WriteSub, /* presumably writeSub */
|
| + _ISO_2022_SafeClone, /* presumably safeClone */
|
| + _ISO_2022_GetUnicodeSet, /* presumably getUnicodeSet */
|
| +
|
| + NULL, /* presumably toUTF8: none */
|
| + NULL /* presumably fromUTF8: none */
|
| +};
|
| +static const UConverterStaticData _ISO2022JPStaticData={ /* constant descriptor for the "ISO_2022_JP" converter; NOTE(review): field labels assume the UConverterStaticData layout in ucnv_bld.h -- confirm */
|
| + sizeof(UConverterStaticData), /* size of this structure */
|
| + "ISO_2022_JP", /* converter name */
|
| + 0, /* presumably the codepage number (the generic table uses 2022 here) */
|
| + UCNV_IBM, /* presumably the platform */
|
| + UCNV_ISO_2022, /* presumably the conversion type */
|
| + 1, /* presumably min bytes per char (the max follows) */
|
| + 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
|
| + { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence */
|
| + 1, /* presumably the length of that sequence */
|
| + FALSE, /* presumably hasToUnicodeFallback */
|
| + FALSE, /* presumably hasFromUnicodeFallback */
|
| + 0, /* presumably unicodeMask */
|
| + 0, /* presumably subChar1 */
|
| + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
| +};
|
| +
|
| +namespace { /* unnamed namespace: the record below has internal linkage */
|
| +
|
| +const UConverterSharedData _ISO2022JPData={ /* shared-data record pairing _ISO2022JPStaticData with _ISO2022JPImpl */
|
| + sizeof(UConverterSharedData), /* size of this structure */
|
| + ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking static data -- confirm against ucnv_bld.h */
|
| + NULL, /* presumably dataMemory: none */
|
| + NULL, /* presumably table: none */
|
| + &_ISO2022JPStaticData, /* static descriptor defined above */
|
| + FALSE, /* presumably sharedDataCached */
|
| + &_ISO2022JPImpl, /* function table defined above */
|
| + 0, UCNV_MBCS_TABLE_INITIALIZER /* presumably toUnicodeStatus, then the embedded MBCS table */
|
| +};
|
| +
|
| +} // namespace
|
| +
|
| +/************* KR ***************/
|
| +static const UConverterImpl _ISO2022KRImpl={ /* function table for the ISO-2022-KR converter; referenced by _ISO2022KRData */
|
| + UCNV_ISO_2022, /* converter type constant */
|
| + /* NOTE(review): slot labels below assume the UConverterImpl member order declared in ucnv_cnv.h -- confirm */
|
| + NULL, /* presumably load: none */
|
| + NULL, /* presumably unload: none */
|
| +
|
| + _ISO2022Open, /* open/close/reset are the same functions used by the other ISO-2022 tables in this file */
|
| + _ISO2022Close,
|
| + _ISO2022Reset,
|
| +
|
| + UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* presumably toUnicode; same function as the next slot */
|
| + UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* presumably toUnicodeWithOffsets */
|
| + UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* presumably fromUnicode; same function as the next slot */
|
| + UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* presumably fromUnicodeWithOffsets */
|
| + NULL, /* presumably getNextUChar: none */
|
| +
|
| + NULL, /* presumably getStarters: none */
|
| + _ISO2022getName, /* presumably getName */
|
| + _ISO_2022_WriteSub, /* presumably writeSub */
|
| + _ISO_2022_SafeClone, /* presumably safeClone */
|
| + _ISO_2022_GetUnicodeSet, /* presumably getUnicodeSet */
|
| +
|
| + NULL, /* presumably toUTF8: none */
|
| + NULL /* presumably fromUTF8: none */
|
| +};
|
| +static const UConverterStaticData _ISO2022KRStaticData={ /* constant descriptor for the "ISO_2022_KR" converter; NOTE(review): field labels assume the UConverterStaticData layout in ucnv_bld.h -- confirm */
|
| + sizeof(UConverterStaticData), /* size of this structure */
|
| + "ISO_2022_KR", /* converter name */
|
| + 0, /* presumably the codepage number (the generic table uses 2022 here) */
|
| + UCNV_IBM, /* presumably the platform */
|
| + UCNV_ISO_2022, /* presumably the conversion type */
|
| + 1, /* presumably min bytes per char (the max follows) */
|
| + 3, /* max 3 bytes per UChar: SO+DBCS */
|
| + { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence */
|
| + 1, /* presumably the length of that sequence */
|
| + FALSE, /* presumably hasToUnicodeFallback */
|
| + FALSE, /* presumably hasFromUnicodeFallback */
|
| + 0, /* presumably unicodeMask */
|
| + 0, /* presumably subChar1 */
|
| + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
| +};
|
| +
|
| +namespace { /* unnamed namespace: the record below has internal linkage */
|
| +
|
| +const UConverterSharedData _ISO2022KRData={ /* shared-data record pairing _ISO2022KRStaticData with _ISO2022KRImpl */
|
| + sizeof(UConverterSharedData), /* size of this structure */
|
| + ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking static data -- confirm against ucnv_bld.h */
|
| + NULL, /* presumably dataMemory: none */
|
| + NULL, /* presumably table: none */
|
| + &_ISO2022KRStaticData, /* static descriptor defined above */
|
| + FALSE, /* presumably sharedDataCached */
|
| + &_ISO2022KRImpl, /* function table defined above */
|
| + 0, UCNV_MBCS_TABLE_INITIALIZER /* presumably toUnicodeStatus, then the embedded MBCS table */
|
| +};
|
| +
|
| +} // namespace
|
| +
|
| +/*************** CN ***************/
|
| +static const UConverterImpl _ISO2022CNImpl={ /* function table for the ISO-2022-CN converter; referenced by _ISO2022CNData */
|
| + /* NOTE(review): slot labels below assume the UConverterImpl member order declared in ucnv_cnv.h -- confirm */
|
| + UCNV_ISO_2022, /* converter type constant */
|
| +
|
| + NULL, /* presumably load: none */
|
| + NULL, /* presumably unload: none */
|
| +
|
| + _ISO2022Open, /* open/close/reset are the same functions used by the other ISO-2022 tables in this file */
|
| + _ISO2022Close,
|
| + _ISO2022Reset,
|
| +
|
| + UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* presumably toUnicode; same function as the next slot */
|
| + UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* presumably toUnicodeWithOffsets */
|
| + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* presumably fromUnicode; same function as the next slot */
|
| + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* presumably fromUnicodeWithOffsets */
|
| + NULL, /* presumably getNextUChar: none */
|
| +
|
| + NULL, /* presumably getStarters: none */
|
| + _ISO2022getName, /* presumably getName */
|
| + _ISO_2022_WriteSub, /* presumably writeSub */
|
| + _ISO_2022_SafeClone, /* presumably safeClone */
|
| + _ISO_2022_GetUnicodeSet, /* presumably getUnicodeSet */
|
| +
|
| + NULL, /* presumably toUTF8: none */
|
| + NULL /* presumably fromUTF8: none */
|
| +};
|
| +static const UConverterStaticData _ISO2022CNStaticData={ /* constant descriptor for the "ISO_2022_CN" converter; NOTE(review): field labels assume the UConverterStaticData layout in ucnv_bld.h -- confirm */
|
| + sizeof(UConverterStaticData), /* size of this structure */
|
| + "ISO_2022_CN", /* converter name */
|
| + 0, /* presumably the codepage number (the generic table uses 2022 here) */
|
| + UCNV_IBM, /* presumably the platform */
|
| + UCNV_ISO_2022, /* presumably the conversion type */
|
| + 1, /* presumably min bytes per char (the max follows) */
|
| + 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
|
| + { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence */
|
| + 1, /* presumably the length of that sequence */
|
| + FALSE, /* presumably hasToUnicodeFallback */
|
| + FALSE, /* presumably hasFromUnicodeFallback */
|
| + 0, /* presumably unicodeMask */
|
| + 0, /* presumably subChar1 */
|
| + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
| +};
|
| +
|
| +namespace { /* unnamed namespace: the record below has internal linkage */
|
| +
|
| +const UConverterSharedData _ISO2022CNData={ /* shared-data record pairing _ISO2022CNStaticData with _ISO2022CNImpl */
|
| + sizeof(UConverterSharedData), /* size of this structure */
|
| + ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking static data -- confirm against ucnv_bld.h */
|
| + NULL, /* presumably dataMemory: none */
|
| + NULL, /* presumably table: none */
|
| + &_ISO2022CNStaticData, /* static descriptor defined above */
|
| + FALSE, /* presumably sharedDataCached */
|
| + &_ISO2022CNImpl, /* function table defined above */
|
| + 0, UCNV_MBCS_TABLE_INITIALIZER /* presumably toUnicodeStatus, then the embedded MBCS table */
|
| +};
|
| +
|
| +} // namespace
|
| +
|
| +#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */
|
|
|
| Property changes on: icu51/source/common/ucnv2022.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|