icu51/source/common/ucnvlat1.c - Issue 20882002: Check in the pristine copy of ICU 51.2

Unified Diff: icu51/source/common/ucnvlat1.c

Issue 20882002: Check in the pristine copy of ICU 51.2 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu51/source/common/ucnvlat1.c

===================================================================

--- icu51/source/common/ucnvlat1.c (revision 0)

+++ icu51/source/common/ucnvlat1.c (revision 0)

@@ -0,0 +1,744 @@

+/*

+**********************************************************************

+* file name: ucnvlat1.c

+* encoding: US-ASCII

+* tab size: 8 (not used)

+* indentation:4

+* created on: 2000feb07

+* created by: Markus W. Scherer

+*/

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "unicode/ucnv.h"

+#include "unicode/uset.h"

+#include "unicode/utf8.h"

+#include "ucnv_bld.h"

+#include "ucnv_cnv.h"

+/* control optimizations according to the platform */

+#define LATIN1_UNROLL_FROM_UNICODE 1

+/* ISO 8859-1 --------------------------------------------------------------- */

+/* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets().
+ *
+ * Converts ISO 8859-1 bytes to UChars by copying each byte value unchanged.
+ * The number of units converted is the smaller of the source length and the
+ * target capacity; when the source is longer than the target,
+ * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
+ * If pArgs->offsets is not NULL, one source index per output unit is written,
+ * counting up from 0.
+ * On return pArgs->source and pArgs->target (and pArgs->offsets, if used)
+ * point past the converted data.
+ *
+ * NOTE(review): in this copy the closing braces of the
+ * if(targetCapacity>=8) block and of the function body are not visible;
+ * confirm against the upstream file before relying on the block structure.
+ */

+static void

+_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

+ UErrorCode *pErrorCode) {

+ const uint8_t *source;

+ UChar *target;

+ int32_t targetCapacity, length;

+ int32_t *offsets;

+ int32_t sourceIndex;

+ /* set up the local pointers */

+ source=(const uint8_t *)pArgs->source;

+ target=pArgs->target;

+ targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

+ offsets=pArgs->offsets;

+ sourceIndex=0;

+ /*

+ * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

+ * for the minimum of the sourceLength and targetCapacity

+ */

+ length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);

+ if(length<=targetCapacity) {

+ targetCapacity=length;

+ } else {

+ /* target will be full */

+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

+ length=targetCapacity;

+ }

+ if(targetCapacity>=8) {

+ /* This loop is unrolled for speed and improved pipelining. */

+ int32_t count, loops;

+ loops=count=targetCapacity>>3; /* number of complete groups of 8 units */

+ length=targetCapacity&=0x7; /* 0..7 units remain for the one-at-a-time loops below */

+ do {

+ target[0]=source[0];

+ target[1]=source[1];

+ target[2]=source[2];

+ target[3]=source[3];

+ target[4]=source[4];

+ target[5]=source[5];

+ target[6]=source[6];

+ target[7]=source[7];

+ target+=8;

+ source+=8;

+ } while(--count>0);

+ if(offsets!=NULL) {

+ do { /* one offset per unit copied by the unrolled loop above */

+ offsets[0]=sourceIndex++;

+ offsets[1]=sourceIndex++;

+ offsets[2]=sourceIndex++;

+ offsets[3]=sourceIndex++;

+ offsets[4]=sourceIndex++;

+ offsets[5]=sourceIndex++;

+ offsets[6]=sourceIndex++;

+ offsets[7]=sourceIndex++;

+ offsets+=8;

+ } while(--loops>0);

+ }

+ /* conversion loop: copy the units not handled by the unrolled loop, one at a time */

+ while(targetCapacity>0) {

+ *target++=*source++;

+ --targetCapacity;

+ }

+ /* write back the updated pointers */

+ pArgs->source=(const char *)source;

+ pArgs->target=target;

+ /* set offsets for the units copied by the one-at-a-time loop (length counts them) */

+ if(offsets!=NULL) {

+ while(length>0) {

+ *offsets++=sourceIndex++;

+ --length;

+ }

+ pArgs->offsets=offsets;

+ }

+/* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar().
+ *
+ * If at least one byte is available before pArgs->sourceLimit, advances
+ * pArgs->source by one byte and returns that byte's value as the code point.
+ * Otherwise sets *pErrorCode to U_INDEX_OUTOFBOUNDS_ERROR and returns 0xffff.
+ *
+ * NOTE(review): the closing brace of the function body is not visible in
+ * this copy; confirm against the upstream file.
+ */

+static UChar32

+_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,

+ UErrorCode *pErrorCode) {

+ const uint8_t *source=(const uint8_t *)pArgs->source;

+ if(source<(const uint8_t *)pArgs->sourceLimit) {

+ pArgs->source=(const char *)(source+1); /* consume exactly one byte */

+ return *source;

+ }

+ /* no output because of empty input */

+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

+ return 0xffff;

+/* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets().
+ *
+ * Converts UTF-16 code units to single bytes. Units up to max are copied as
+ * bytes; max is 0xff when cnv->sharedData is &_Latin1Data and 0x7f otherwise
+ * (the in-code comment labels that case US-ASCII).
+ * A unit above max ends the conversion:
+ * - a lead surrogate at the very end of the input is saved in
+ *   cnv->fromUChar32 without setting an error; a later call that finds
+ *   cnv->fromUChar32!=0 resumes at the getTrail label;
+ * - otherwise cnv->fromUChar32 receives the code point and *pErrorCode is
+ *   set to U_ILLEGAL_CHAR_FOUND if that value is still a surrogate
+ *   (unpaired), or to U_INVALID_CHAR_FOUND if it is not.
+ * U_BUFFER_OVERFLOW_ERROR is set when no error occurred, input remains and
+ * the target is full. pArgs->source, pArgs->target and pArgs->offsets are
+ * written back at the end.
+ *
+ * NOTE(review): this copy appears to have lost lines. The closing braces of
+ * the unrolled section (before #endif), of the final if(offsets!=NULL) and
+ * of the function body are not visible, and the offsets loop in the unrolled
+ * section writes one offset per 16 converted units as shown. Confirm against
+ * the upstream file.
+ */

+static void

+_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

+ UErrorCode *pErrorCode) {

+ UConverter *cnv;

+ const UChar *source, *sourceLimit;

+ uint8_t *target, *oldTarget;

+ int32_t targetCapacity, length;

+ int32_t *offsets;

+ UChar32 cp;

+ UChar c, max;

+ int32_t sourceIndex;

+ /* set up the local pointers */

+ cnv=pArgs->converter;

+ source=pArgs->source;

+ sourceLimit=pArgs->sourceLimit;

+ target=oldTarget=(uint8_t *)pArgs->target;

+ targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

+ offsets=pArgs->offsets;

+ if(cnv->sharedData==&_Latin1Data) {

+ max=0xff; /* Latin-1 */

+ } else {

+ max=0x7f; /* US-ASCII */

+ }

+ /* get the converter state from UConverter */

+ cp=cnv->fromUChar32;

+ /* sourceIndex=-1 if the current character began in the previous buffer */

+ sourceIndex= cp==0 ? 0 : -1;

+ /*

+ * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

+ * for the minimum of the sourceLength and targetCapacity

+ */

+ length=(int32_t)(sourceLimit-source);

+ if(length<targetCapacity) {

+ targetCapacity=length;

+ }

+ /* conversion loop */

+ if(cp!=0 && targetCapacity>0) { /* a code point is pending from a previous call: continue with it */

+ goto getTrail;

+ }

+#if LATIN1_UNROLL_FROM_UNICODE

+ /* unroll the loop with the most common case */

+ if(targetCapacity>=16) {

+ int32_t count, loops;

+ UChar u, oredChars;

+ loops=count=targetCapacity>>4; /* number of complete groups of 16 units */

+ do {

+ oredChars=u=*source++; /* oredChars accumulates the OR of all 16 units for one range test */

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ oredChars|=u=*source++;

+ *target++=(uint8_t)u;

+ /* were all 16 entries really valid? */

+ if(oredChars>max) {

+ /* no, return to the first of these 16 */

+ source-=16;

+ target-=16;

+ break;

+ }

+ } while(--count>0);

+ count=loops-count; /* number of 16-unit groups that were fully converted */

+ targetCapacity-=16*count;

+ if(offsets!=NULL) {

+ oldTarget+=16*count; /* these units get their offsets here, not in the final offsets loop */

+ while(count>0) {

+ *offsets++=sourceIndex++;

+ --count;

+ }

+#endif

+ /* conversion loop */

+ c=0; /* keeps the c>max test below false when the loop body never runs */

+ while(targetCapacity>0 && (c=*source++)<=max) {

+ /* convert the Unicode code point */

+ *target++=(uint8_t)c;

+ --targetCapacity;

+ }

+ if(c>max) {

+ cp=c;

+ if(!U_IS_SURROGATE(cp)) {

+ /* callback(unassigned) */

+ } else if(U_IS_SURROGATE_LEAD(cp)) {

+getTrail:

+ if(source<sourceLimit) {

+ /* test the following code unit */

+ UChar trail=*source;

+ if(U16_IS_TRAIL(trail)) {

+ ++source;

+ cp=U16_GET_SUPPLEMENTARY(cp, trail);

+ /* this codepage does not map supplementary code points */

+ /* callback(unassigned) */

+ } else {

+ /* this is an unmatched lead code unit (1st surrogate) */

+ /* callback(illegal) */

+ }

+ } else {

+ /* no more input */

+ cnv->fromUChar32=cp; /* remember the lead surrogate for the next call */

+ goto noMoreInput;

+ }

+ } else {

+ /* this is an unmatched trail code unit (2nd surrogate) */

+ /* callback(illegal) */

+ }

+ *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;

+ cnv->fromUChar32=cp;

+ }

+noMoreInput:

+ /* set offsets since the start */

+ if(offsets!=NULL) {

+ size_t count=target-oldTarget; /* bytes written that do not have an offset yet */

+ while(count>0) {

+ *offsets++=sourceIndex++;

+ --count;

+ }

+ if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {

+ /* target is full */

+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

+ }

+ /* write back the updated pointers */

+ pArgs->source=source;

+ pArgs->target=(char *)target;

+ pArgs->offsets=offsets;

+/* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8().
+ *
+ * Reads UTF-8 bytes from pToUArgs->source and writes Latin-1 bytes to
+ * pFromUArgs->target.
+ * ASCII bytes are copied. A lead byte 0xc2 or 0xc3 followed by a trail byte
+ * 0x80..0xbf becomes the single byte ((lead&3)<<6)|(trail-0x80).
+ * For any other input *pErrorCode is set to U_USING_DEFAULT_WARNING and the
+ * function returns; per the in-code comments this signals a fall back to
+ * the pivoting implementation.
+ * U_BUFFER_OVERFLOW_ERROR is set when the target is full.
+ * A lead byte pending from a previous call is taken from
+ * utf8->toUnicodeStatus; a lead byte left at the very end of this input is
+ * stored there (and in utf8->toUBytes[0]) for the next call.
+ *
+ * NOTE(review): in this copy the closing braces of the initial
+ * if(c!=0 && source<sourceLimit) block, of the while loop and of the
+ * function body are not visible; confirm against the upstream file.
+ */

+static void

+ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

+ UConverterToUnicodeArgs *pToUArgs,

+ UErrorCode *pErrorCode) {

+ UConverter *utf8;

+ const uint8_t *source, *sourceLimit;

+ uint8_t *target;

+ int32_t targetCapacity;

+ UChar32 c;

+ uint8_t b, t1;

+ /* set up the local pointers */

+ utf8=pToUArgs->converter;

+ source=(uint8_t *)pToUArgs->source;

+ sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

+ target=(uint8_t *)pFromUArgs->target;

+ targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

+ /* get the converter state from the UTF-8 UConverter */

+ c=(UChar32)utf8->toUnicodeStatus;

+ if(c!=0 && source<sourceLimit) {

+ if(targetCapacity==0) {

+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

+ return;

+ } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) {

+ ++source;

+ *target++=(uint8_t)(((c&3)<<6)|t1);

+ --targetCapacity;

+ utf8->toUnicodeStatus=0; /* the pending lead byte has been consumed */

+ utf8->toULength=0;

+ } else {

+ /* complicated, illegal or unmappable input: fall back to the pivoting implementation */

+ *pErrorCode=U_USING_DEFAULT_WARNING;

+ return;

+ }

+ /*

+ * Make sure that the last byte sequence before sourceLimit is complete

+ * or runs into a lead byte.

+ * In the conversion loop compare source with sourceLimit only once

+ * per multi-byte character.

+ * For Latin-1, adjust sourceLimit only for 1 trail byte because

+ * the conversion loop handles at most 2-byte sequences.

+ */

+ if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) {

+ --sourceLimit;

+ }

+ /* conversion loop */

+ while(source<sourceLimit) {

+ if(targetCapacity>0) {

+ b=*source++;

+ if((int8_t)b>=0) { /* true for 0x00..0x7f: the top bit is clear */

+ /* convert ASCII */

+ *target++=(uint8_t)b;

+ --targetCapacity;

+ } else if( /* handle U+0080..U+00FF inline */

+ b>=0xc2 && b<=0xc3 &&

+ (t1=(uint8_t)(*source-0x80)) <= 0x3f

+ ) {

+ ++source;

+ *target++=(uint8_t)(((b&3)<<6)|t1);

+ --targetCapacity;

+ } else {

+ /* complicated, illegal or unmappable input: fall back to the pivoting implementation */

+ pToUArgs->source=(char *)(source-1); /* back up so that b is not consumed */

+ pFromUArgs->target=(char *)target;

+ *pErrorCode=U_USING_DEFAULT_WARNING;

+ return;

+ }

+ } else {

+ /* target is full */

+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

+ break;

+ }

+ /*

+ * The sourceLimit may have been adjusted before the conversion loop

+ * to stop before a truncated sequence.

+ * If so, then collect the truncated sequence now.

+ * For Latin-1, there is at most exactly one lead byte because of the

+ * smaller sourceLimit adjustment logic.

+ */

+ if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {

+ utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;

+ utf8->toULength=1;

+ utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; /* presumably the expected total length of the sequence started by b; confirm against the UTF-8 converter */

+ }

+ /* write back the updated pointers */

+ pToUArgs->source=(char *)source;

+ pFromUArgs->target=(char *)target;

+static void

+_Latin1GetUnicodeSet(const UConverter *cnv,

+ const USetAdder *sa,

+ UConverterUnicodeSet which,

+ UErrorCode *pErrorCode) { /* Adds the code point range 0..0xff to the set behind sa.
+ * In the visible code cnv, which and pErrorCode are not used.
+ * NOTE(review): the closing brace of the function body is not visible
+ * in this copy; confirm against the upstream file. */

+ sa->addRange(sa->set, 0, 0xff);

+static const UConverterImpl _Latin1Impl={ /* Function table for the ISO 8859-1 converter.
+ * The meaning of each slot is defined by UConverterImpl, which is declared
+ * outside this file.
+ * NOTE(review): other parts of this copy have lost repeated lines, so runs
+ * of identical entries (e.g. several NULLs) may have been collapsed here;
+ * confirm the entry count and order against the upstream file. */

+ UCNV_LATIN_1,

+ NULL,

+ _Latin1ToUnicodeWithOffsets,

+ _Latin1FromUnicodeWithOffsets,

+ _Latin1GetNextUChar,

+ NULL,

+ _Latin1GetUnicodeSet,

+ NULL,

+ ucnv_Latin1FromUTF8

+};

+static const UConverterStaticData _Latin1StaticData={ /* Constant description of the ISO 8859-1 converter.
+ * Field meanings are defined by UConverterStaticData, declared outside this
+ * file; the per-line notes below are assumptions to be confirmed.
+ * NOTE(review): repeated identical lines may have been collapsed in this
+ * copy; confirm the entry count against the upstream file. */

+ sizeof(UConverterStaticData),

+ "ISO-8859-1", /* presumably the converter name */

+ 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, /* presumably codepage number, platform, converter type, min and max bytes per character */

+ { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* presumably the substitution byte sequence and its length, then two flags */

+ 0,

+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

+};

+const UConverterSharedData _Latin1Data={ /* Shared data object for the ISO 8859-1 converter.
+ * It points at _Latin1StaticData and _Latin1Impl.
+ * _Latin1FromUnicodeWithOffsets compares cnv->sharedData with the address
+ * of this object to choose 0xff as its upper limit.
+ * The meaning of the remaining fields is defined by UConverterSharedData,
+ * declared outside this file. */

+ sizeof(UConverterSharedData), ~((uint32_t) 0),

+ NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl,

+ 0

+};

+/* US-ASCII ----------------------------------------------------------------- */

+/* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets().
+ *
+ * Converts US-ASCII bytes to UChars. Bytes 0..0x7f are copied unchanged.
+ * A byte above 0x7f stops the conversion: the byte is consumed, stored in
+ * cnv->toUBytes[0] with cnv->toULength=1, and *pErrorCode is set to
+ * U_ILLEGAL_CHAR_FOUND.
+ * Otherwise, if input remains and the target is full, *pErrorCode is set to
+ * U_BUFFER_OVERFLOW_ERROR.
+ * If pArgs->offsets is not NULL, one source index per output unit is
+ * written, counting up from 0. pArgs->source, pArgs->target and
+ * pArgs->offsets are written back at the end.
+ *
+ * NOTE(review): in this copy the closing braces of the unrolled section
+ * (if(targetCapacity>=8) and its inner if(offsets!=NULL)), of the final
+ * if(offsets!=NULL) and of the function body are not visible; confirm
+ * against the upstream file.
+ */

+static void

+_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

+ UErrorCode *pErrorCode) {

+ const uint8_t *source, *sourceLimit;

+ UChar *target, *oldTarget;

+ int32_t targetCapacity, length;

+ int32_t *offsets;

+ int32_t sourceIndex;

+ uint8_t c;

+ /* set up the local pointers */

+ source=(const uint8_t *)pArgs->source;

+ sourceLimit=(const uint8_t *)pArgs->sourceLimit;

+ target=oldTarget=pArgs->target;

+ targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

+ offsets=pArgs->offsets;

+ /* this function reads no partial-character state, so the source index always starts at 0 */

+ sourceIndex=0;

+ /*

+ * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

+ * for the minimum of the sourceLength and targetCapacity

+ */

+ length=(int32_t)(sourceLimit-source);

+ if(length<targetCapacity) {

+ targetCapacity=length;

+ }

+ if(targetCapacity>=8) {

+ /* This loop is unrolled for speed and improved pipelining. */

+ int32_t count, loops;

+ UChar oredChars;

+ loops=count=targetCapacity>>3; /* number of complete groups of 8 units */

+ do {

+ oredChars=target[0]=source[0]; /* oredChars accumulates the OR of all 8 units for one range test */

+ oredChars|=target[1]=source[1];

+ oredChars|=target[2]=source[2];

+ oredChars|=target[3]=source[3];

+ oredChars|=target[4]=source[4];

+ oredChars|=target[5]=source[5];

+ oredChars|=target[6]=source[6];

+ oredChars|=target[7]=source[7];

+ /* were all 8 entries really valid? */

+ if(oredChars>0x7f) {

+ /* no: leave source and target at the first of these 8; the loop below re-examines them one by one */

+ break;

+ }

+ source+=8;

+ target+=8;

+ } while(--count>0);

+ count=loops-count; /* number of 8-unit groups that were fully converted */

+ targetCapacity-=count*8;

+ if(offsets!=NULL) {

+ oldTarget+=count*8; /* these units get their offsets here, not in the final offsets loop */

+ while(count>0) {

+ offsets[0]=sourceIndex++;

+ offsets[1]=sourceIndex++;

+ offsets[2]=sourceIndex++;

+ offsets[3]=sourceIndex++;

+ offsets[4]=sourceIndex++;

+ offsets[5]=sourceIndex++;

+ offsets[6]=sourceIndex++;

+ offsets[7]=sourceIndex++;

+ offsets+=8;

+ --count;

+ }

+ /* conversion loop */

+ c=0; /* keeps the c>0x7f test below false when the loop body never runs */

+ while(targetCapacity>0 && (c=*source++)<=0x7f) {

+ *target++=c;

+ --targetCapacity;

+ }

+ if(c>0x7f) {

+ /* callback(illegal); copy the current bytes to toUBytes[] */

+ UConverter *cnv=pArgs->converter;

+ cnv->toUBytes[0]=c;

+ cnv->toULength=1;

+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;

+ } else if(source<sourceLimit && target>=pArgs->targetLimit) {

+ /* target is full */

+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

+ }

+ /* set offsets since the start */

+ if(offsets!=NULL) {

+ size_t count=target-oldTarget; /* units written that do not have an offset yet */

+ while(count>0) {

+ *offsets++=sourceIndex++;

+ --count;

+ }

+ /* write back the updated pointers */

+ pArgs->source=(const char *)source;

+ pArgs->target=target;

+ pArgs->offsets=offsets;

+/* This is a table-less version of ucnv_MBCSSingleGetNextUChar().
+ *
+ * If a byte is available before pArgs->sourceLimit, consumes it (advancing
+ * pArgs->source by one). A byte up to 0x7f is returned as the code point.
+ * A larger byte is stored in cnv->toUBytes[0] with cnv->toULength=1,
+ * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND and 0xffff is returned.
+ * With no input available, *pErrorCode is set to U_INDEX_OUTOFBOUNDS_ERROR
+ * and 0xffff is returned.
+ *
+ * NOTE(review): in this copy the closing braces of the outer if block and
+ * of the function body are not visible; confirm against the upstream file.
+ */

+static UChar32

+_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,

+ UErrorCode *pErrorCode) {

+ const uint8_t *source;

+ uint8_t b;

+ source=(const uint8_t *)pArgs->source;

+ if(source<(const uint8_t *)pArgs->sourceLimit) {

+ b=*source++;

+ pArgs->source=(const char *)source; /* the byte is consumed whether or not it is valid */

+ if(b<=0x7f) {

+ return b;

+ } else {

+ UConverter *cnv=pArgs->converter;

+ cnv->toUBytes[0]=b;

+ cnv->toULength=1;

+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;

+ return 0xffff;

+ }

+ /* no output because of empty input */

+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

+ return 0xffff;

+/* "Convert" UTF-8 to US-ASCII: Validate and copy.
+ *
+ * Copies bytes 0..0x7f from pToUArgs->source to pFromUArgs->target.
+ * If the UTF-8 converter has a pending partial character
+ * (toUnicodeStatus!=0), sets U_USING_DEFAULT_WARNING and returns without
+ * converting anything.
+ * A byte above 0x7f is not consumed; *pErrorCode is set to
+ * U_USING_DEFAULT_WARNING so that, per the in-code comment, the standard
+ * converter handles it.
+ * Otherwise, if input remains and the target is full, *pErrorCode is set to
+ * U_BUFFER_OVERFLOW_ERROR.
+ * pToUArgs->source and pFromUArgs->target are written back at the end.
+ *
+ * NOTE(review): the unrolled loop counts 16 units per iteration (>>4,
+ * source-=16, 16*count) but only 2 copy statements are visible in this
+ * copy, so lines were presumably dropped. The closing brace of the function
+ * body is not visible either. Confirm against the upstream file.
+ */

+static void

+ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

+ UConverterToUnicodeArgs *pToUArgs,

+ UErrorCode *pErrorCode) {

+ const uint8_t *source, *sourceLimit;

+ uint8_t *target;

+ int32_t targetCapacity, length;

+ uint8_t c;

+ if(pToUArgs->converter->toUnicodeStatus!=0) {

+ /* no handling of partial UTF-8 characters here, fall back to pivoting */

+ *pErrorCode=U_USING_DEFAULT_WARNING;

+ return;

+ }

+ /* set up the local pointers */

+ source=(const uint8_t *)pToUArgs->source;

+ sourceLimit=(const uint8_t *)pToUArgs->sourceLimit;

+ target=(uint8_t *)pFromUArgs->target;

+ targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

+ /*

+ * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter

+ * for the minimum of the sourceLength and targetCapacity

+ */

+ length=(int32_t)(sourceLimit-source);

+ if(length<targetCapacity) {

+ targetCapacity=length;

+ }

+ /* unroll the loop with the most common case */

+ if(targetCapacity>=16) {

+ int32_t count, loops;

+ uint8_t oredChars;

+ loops=count=targetCapacity>>4; /* number of complete groups of 16 bytes */

+ do {

+ oredChars=*target++=*source++; /* oredChars accumulates the OR of the copied bytes for one range test */

+ oredChars|=*target++=*source++;

+ /* were all 16 entries really valid? */

+ if(oredChars>0x7f) {

+ /* no, return to the first of these 16 */

+ source-=16;

+ target-=16;

+ break;

+ }

+ } while(--count>0);

+ count=loops-count; /* number of 16-byte groups that were fully copied */

+ targetCapacity-=16*count;

+ }

+ /* conversion loop */

+ c=0; /* keeps the c>0x7f test below false when the loop body never runs */

+ while(targetCapacity>0 && (c=*source)<=0x7f) { /* source advances only for ASCII bytes */

+ ++source;

+ *target++=c;

+ --targetCapacity;

+ }

+ if(c>0x7f) {

+ /* non-ASCII character, handle in standard converter */

+ *pErrorCode=U_USING_DEFAULT_WARNING;

+ } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) {

+ /* target is full */

+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

+ }

+ /* write back the updated pointers */

+ pToUArgs->source=(const char *)source;

+ pFromUArgs->target=(char *)target;

+static void

+_ASCIIGetUnicodeSet(const UConverter *cnv,

+ const USetAdder *sa,

+ UConverterUnicodeSet which,

+ UErrorCode *pErrorCode) { /* Adds the code point range 0..0x7f to the set behind sa.
+ * In the visible code cnv, which and pErrorCode are not used.
+ * NOTE(review): the closing brace of the function body is not visible
+ * in this copy; confirm against the upstream file. */

+ sa->addRange(sa->set, 0, 0x7f);

+static const UConverterImpl _ASCIIImpl={ /* Function table for the US-ASCII converter.
+ * It uses _Latin1FromUnicodeWithOffsets for the from-Unicode direction;
+ * that function selects 0x7f as its limit when cnv->sharedData is not
+ * &_Latin1Data.
+ * The meaning of each slot is defined by UConverterImpl, which is declared
+ * outside this file.
+ * NOTE(review): runs of identical entries (e.g. several NULLs) may have
+ * been collapsed in this copy; confirm the entry count and order against
+ * the upstream file. */

+ UCNV_US_ASCII,

+ NULL,

+ _ASCIIToUnicodeWithOffsets,

+ _Latin1FromUnicodeWithOffsets,

+ _ASCIIGetNextUChar,

+ NULL,

+ _ASCIIGetUnicodeSet,

+ NULL,

+ ucnv_ASCIIFromUTF8

+};

+static const UConverterStaticData _ASCIIStaticData={ /* Constant description of the US-ASCII converter.
+ * Field meanings are defined by UConverterStaticData, declared outside this
+ * file; the per-line notes below are assumptions to be confirmed.
+ * NOTE(review): repeated identical lines may have been collapsed in this
+ * copy; confirm the entry count against the upstream file. */

+ sizeof(UConverterStaticData),

+ "US-ASCII", /* presumably the converter name */

+ 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, /* presumably codepage number, platform, converter type, min and max bytes per character */

+ { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* presumably the substitution byte sequence and its length, then two flags */

+ 0,

+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

+};

+const UConverterSharedData _ASCIIData={ /* Shared data object for the US-ASCII converter.
+ * It points at _ASCIIStaticData and _ASCIIImpl.
+ * The meaning of the remaining fields is defined by UConverterSharedData,
+ * declared outside this file. */

+ sizeof(UConverterSharedData), ~((uint32_t) 0),

+ NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl,

+ 0

+};

+#endif

Property changes on: icu51/source/common/ucnvlat1.c

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu51/source/common/ucnvisci.c ('k') | icu51/source/common/ucnvmbcs.h » ('j') | no next file with comments »