/* ****************************************************************************** * * Copyright (C) 2000-2009, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: ucnvscsu.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2000nov18 * created by: Markus W. Scherer * * This is an implementation of the Standard Compression Scheme for Unicode * as defined in http://www.unicode.org/unicode/reports/tr6/ . * Reserved commands and window settings are treated as illegal sequences and * will result in callback calls. */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/ucnv.h" #include "unicode/ucnv_cb.h" #include "ucnv_bld.h" #include "ucnv_cnv.h" #include "cmemory.h" /* SCSU definitions --------------------------------------------------------- */ /* SCSU command byte values: S* are single-byte mode commands, U* are Unicode mode commands */ enum { SQ0=0x01, /* Quote from window pair 0 */ SQ7=0x08, /* Quote from window pair 7 */ SDX=0x0B, /* Define a window as extended */ Srs=0x0C, /* reserved */ SQU=0x0E, /* Quote a single Unicode character */ SCU=0x0F, /* Change to Unicode mode */ SC0=0x10, /* Select window 0 */ SC7=0x17, /* Select window 7 */ SD0=0x18, /* Define and select window 0 */ SD7=0x1F, /* Define and select window 7 */ UC0=0xE0, /* Unicode mode: select window 0 (changes to single-byte mode) */ UC7=0xE7, /* Unicode mode: select window 7 (changes to single-byte mode) */ UD0=0xE8, /* Unicode mode: define and select window 0 */ UD7=0xEF, /* Unicode mode: define and select window 7 */ UQU=0xF0, /* Unicode mode: quote a single Unicode character */ UDX=0xF1, /* Unicode mode: define a window as extended */ Urs=0xF2 /* Unicode mode: reserved */ }; enum { /* * Unicode code points from 3400 to E000 are not addressable by a * dynamic window, since in these areas no short run alphabets are * found. Therefore add gapOffset to all values from gapThreshold. 
*/ gapThreshold=0x68, gapOffset=0xAC00, /* values between reservedStart and fixedThreshold are reserved */ reservedStart=0xA8, /* use table of predefined fixed offsets for values from fixedThreshold */ fixedThreshold=0xF9 }; /* constant offsets for the 8 static windows */ static const uint32_t staticOffsets[8]={ 0x0000, /* ASCII for quoted tags */ 0x0080, /* Latin-1 Supplement (for access to punctuation) */ 0x0100, /* Latin Extended-A */ 0x0300, /* Combining Diacritical Marks */ 0x2000, /* General Punctuation */ 0x2080, /* Currency Symbols */ 0x2100, /* Letterlike Symbols and Number Forms */ 0x3000 /* CJK Symbols and punctuation */ }; /* initial offsets for the 8 dynamic (sliding) windows */ static const uint32_t initialDynamicOffsets[8]={ 0x0080, /* Latin-1 */ 0x00C0, /* Latin Extended A */ 0x0400, /* Cyrillic */ 0x0600, /* Arabic */ 0x0900, /* Devanagari */ 0x3040, /* Hiragana */ 0x30A0, /* Katakana */ 0xFF00 /* Fullwidth ASCII */ }; /* Table of fixed predefined offsets, indexed by (window offset byte - fixedThreshold) */ static const uint32_t fixedOffsets[]={ /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ /* 0xFA */ 0x0250, /* IPA extensions */ /* 0xFB */ 0x0370, /* Greek */ /* 0xFC */ 0x0530, /* Armenian */ /* 0xFD */ 0x3040, /* Hiragana */ /* 0xFE */ 0x30A0, /* Katakana */ /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ }; /* state values of the toUnicode state machine (kept in SCSUData.toUState between calls) */ enum { readCommand, quotePairOne, quotePairTwo, quoteOne, definePairOne, definePairTwo, defineOne }; typedef struct SCSUData { /* dynamic window offsets, initialized to the default values from initialDynamicOffsets */ uint32_t toUDynamicOffsets[8]; uint32_t fromUDynamicOffsets[8]; /* state machine state - toUnicode */ UBool toUIsSingleByteMode; uint8_t toUState; int8_t toUQuoteWindow, toUDynamicWindow; uint8_t toUByteOne; uint8_t toUPadding[3]; /* state machine state - fromUnicode */ UBool fromUIsSingleByteMode; int8_t fromUDynamicWindow; /* * windowUse[] keeps track of the use of the dynamic windows: * At nextWindowUseIndex there is the least recently used 
window, * and the following windows (in a wrapping manner) are more and more * recently used. * At nextWindowUseIndex-1 there is the most recently used window. */ uint8_t locale; int8_t nextWindowUseIndex; int8_t windowUse[8]; } SCSUData; static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; enum { lGeneric, l_ja }; /* SCSU setup functions ----------------------------------------------------- */ static void _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { SCSUData *scsu=(SCSUData *)cnv->extraInfo; if(choice<=UCNV_RESET_TO_UNICODE) { /* reset toUnicode */ uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); scsu->toUIsSingleByteMode=TRUE; scsu->toUState=readCommand; scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; scsu->toUByteOne=0; cnv->toULength=0; } if(choice!=UCNV_RESET_TO_UNICODE) { /* reset fromUnicode */ uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); scsu->fromUIsSingleByteMode=TRUE; scsu->fromUDynamicWindow=0; scsu->nextWindowUseIndex=0; switch(scsu->locale) { case l_ja: uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); break; default: uprv_memcpy(scsu->windowUse, initialWindowUse, 8); break; } cnv->fromUChar32=0; } } static void _SCSUOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *pErrorCode) { const char *locale=pArgs->locale; if(pArgs->onlyTestIsLoadable) { return; } cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); if(cnv->extraInfo!=NULL) { if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { ((SCSUData *)cnv->extraInfo)->locale=l_ja; } else { ((SCSUData *)cnv->extraInfo)->locale=lGeneric; } _SCSUReset(cnv, UCNV_RESET_BOTH); } else { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; } /* Set the substitution character U+fffd as a Unicode string. 
*/ cnv->subUChars[0]=0xfffd; cnv->subCharLen=-1; } /* Close callback: free the SCSUData in cnv->extraInfo unless cnv->isExtraLocal is set, and clear the pointer either way. */ static void _SCSUClose(UConverter *cnv) { if(cnv->extraInfo!=NULL) { if(!cnv->isExtraLocal) { uprv_free(cnv->extraInfo); } cnv->extraInfo=NULL; } } /* SCSU-to-Unicode conversion functions ------------------------------------- */ static void _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; SCSUData *scsu; const uint8_t *source, *sourceLimit; UChar *target; const UChar *targetLimit; int32_t *offsets; UBool isSingleByteMode; uint8_t state, byteOne; int8_t quoteWindow, dynamicWindow; int32_t sourceIndex, nextSourceIndex; uint8_t b; /* set up the local pointers */ cnv=pArgs->converter; scsu=(SCSUData *)cnv->extraInfo; source=(const uint8_t *)pArgs->source; sourceLimit=(const uint8_t *)pArgs->sourceLimit; target=pArgs->target; targetLimit=pArgs->targetLimit; offsets=pArgs->offsets; /* get the state machine state */ isSingleByteMode=scsu->toUIsSingleByteMode; state=scsu->toUState; quoteWindow=scsu->toUQuoteWindow; dynamicWindow=scsu->toUDynamicWindow; byteOne=scsu->toUByteOne; /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex=state==readCommand ? 0 : -1; nextSourceIndex=0; /* * conversion "loop" * * For performance, this is not a normal C loop. * Instead, there are two code blocks for the two SCSU modes. * The function branches to either one, and a change of the mode is done with a goto to * the other branch. * * Each branch has two conventional loops: * - a fast-path loop for the most common codes in the mode * - a loop for all other codes in the mode * When the fast-path runs into a code that it cannot handle, its loop ends and it * runs into the following loop to handle the other codes. * The end of the input or output buffer is also handled by the slower loop. * The slow loop jumps (goto) to the fast-path loop again as soon as possible. * * The callback handling is done by returning with an error code. 
* The conversion framework actually calls the callback function. */ if(isSingleByteMode) { /* fast path for single-byte mode */ if(state==readCommand) { fastSingle: while(source=0x20) { ++source; ++nextSourceIndex; if(b<=0x7f) { /* write US-ASCII graphic character or DEL */ *target++=(UChar)b; if(offsets!=NULL) { *offsets++=sourceIndex; } } else { /* write from dynamic window */ uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); if(c<=0xffff) { *target++=(UChar)c; if(offsets!=NULL) { *offsets++=sourceIndex; } } else { /* output surrogate pair */ *target++=(UChar)(0xd7c0+(c>>10)); if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); cnv->UCharErrorBufferLength=1; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; goto endloop; } } } sourceIndex=nextSourceIndex; } } /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ singleByteMode: while(source=targetLimit) { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } b=*source++; ++nextSourceIndex; switch(state) { case readCommand: /* redundant conditions are commented out */ /* here: b<0x20 because otherwise we would be in fastSingle */ if((1UL<toUBytes[0]=b; cnv->toULength=1; goto endloop; } /* store the first byte of a multibyte sequence in toUBytes[] */ cnv->toUBytes[0]=b; cnv->toULength=1; break; case quotePairOne: byteOne=b; cnv->toUBytes[1]=b; cnv->toULength=2; state=quotePairTwo; break; case quotePairTwo: *target++=(UChar)((byteOne<<8)|b); if(offsets!=NULL) { *offsets++=sourceIndex; } sourceIndex=nextSourceIndex; state=readCommand; goto fastSingle; case quoteOne: if(b<0x80) { /* all static offsets are in the BMP */ *target++=(UChar)(staticOffsets[quoteWindow]+b); if(offsets!=NULL) { *offsets++=sourceIndex; } } else { /* write from dynamic window */ uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); if(c<=0xffff) { *target++=(UChar)c; if(offsets!=NULL) { *offsets++=sourceIndex; } } else { /* output surrogate pair */ *target++=(UChar)(0xd7c0+(c>>10)); 
if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); cnv->UCharErrorBufferLength=1; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; goto endloop; } } } sourceIndex=nextSourceIndex; state=readCommand; goto fastSingle; case definePairOne: dynamicWindow=(int8_t)((b>>5)&7); byteOne=(uint8_t)(b&0x1f); cnv->toUBytes[1]=b; cnv->toULength=2; state=definePairTwo; break; case definePairTwo: scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); sourceIndex=nextSourceIndex; state=readCommand; goto fastSingle; case defineOne: if(b==0) { /* callback(illegal): Reserved window offset value 0 */ cnv->toUBytes[1]=b; cnv->toULength=2; goto endloop; } else if(btoUDynamicOffsets[dynamicWindow]=b<<7UL; } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; } else if(b>=fixedThreshold) { scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; } else { /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ cnv->toUBytes[1]=b; cnv->toULength=2; goto endloop; } sourceIndex=nextSourceIndex; state=readCommand; goto fastSingle; } } } else { /* fast path for Unicode mode */ if(state==readCommand) { fastUnicode: while(source+1(Urs-UC0)) { *target++=(UChar)((b<<8)|source[1]); if(offsets!=NULL) { *offsets++=sourceIndex; } sourceIndex=nextSourceIndex; nextSourceIndex+=2; source+=2; } } /* normal state machine for Unicode mode */ /* unicodeByteMode: */ while(source=targetLimit) { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } b=*source++; ++nextSourceIndex; switch(state) { case readCommand: if((uint8_t)(b-UC0)>(Urs-UC0)) { byteOne=b; cnv->toUBytes[0]=b; cnv->toULength=1; state=quotePairTwo; } else if(/* UC0<=b && */ b<=UC7) { dynamicWindow=(int8_t)(b-UC0); sourceIndex=nextSourceIndex; isSingleByteMode=TRUE; goto fastSingle; } else if(/* UD0<=b && */ b<=UD7) { dynamicWindow=(int8_t)(b-UD0); isSingleByteMode=TRUE; cnv->toUBytes[0]=b; cnv->toULength=1; state=defineOne; goto 
singleByteMode; } else if(b==UDX) { isSingleByteMode=TRUE; cnv->toUBytes[0]=b; cnv->toULength=1; state=definePairOne; goto singleByteMode; } else if(b==UQU) { cnv->toUBytes[0]=b; cnv->toULength=1; state=quotePairOne; } else /* Urs */ { /* callback(illegal) */ *pErrorCode=U_ILLEGAL_CHAR_FOUND; cnv->toUBytes[0]=b; cnv->toULength=1; goto endloop; } break; case quotePairOne: byteOne=b; cnv->toUBytes[1]=b; cnv->toULength=2; state=quotePairTwo; break; case quotePairTwo: *target++=(UChar)((byteOne<<8)|b); if(offsets!=NULL) { *offsets++=sourceIndex; } sourceIndex=nextSourceIndex; state=readCommand; goto fastUnicode; } } } endloop: /* set the converter state back into UConverter */ if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { /* reset to deal with the next character */ state=readCommand; } else if(state==readCommand) { /* not in a multi-byte sequence, reset toULength */ cnv->toULength=0; } scsu->toUIsSingleByteMode=isSingleByteMode; scsu->toUState=state; scsu->toUQuoteWindow=quoteWindow; scsu->toUDynamicWindow=dynamicWindow; scsu->toUByteOne=byteOne; /* write back the updated pointers */ pArgs->source=(const char *)source; pArgs->target=target; pArgs->offsets=offsets; return; } /* * Identical to _SCSUToUnicodeWithOffsets but without offset handling. * If a change is made in the original function, then either * change this function the same way or * re-copy the original function and remove the variables * offsets, sourceIndex, and nextSourceIndex. 
*/ static void _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; SCSUData *scsu; const uint8_t *source, *sourceLimit; UChar *target; const UChar *targetLimit; UBool isSingleByteMode; uint8_t state, byteOne; int8_t quoteWindow, dynamicWindow; uint8_t b; /* set up the local pointers */ cnv=pArgs->converter; scsu=(SCSUData *)cnv->extraInfo; source=(const uint8_t *)pArgs->source; sourceLimit=(const uint8_t *)pArgs->sourceLimit; target=pArgs->target; targetLimit=pArgs->targetLimit; /* get the state machine state */ isSingleByteMode=scsu->toUIsSingleByteMode; state=scsu->toUState; quoteWindow=scsu->toUQuoteWindow; dynamicWindow=scsu->toUDynamicWindow; byteOne=scsu->toUByteOne; /* * conversion "loop" * * For performance, this is not a normal C loop. * Instead, there are two code blocks for the two SCSU modes. * The function branches to either one, and a change of the mode is done with a goto to * the other branch. * * Each branch has two conventional loops: * - a fast-path loop for the most common codes in the mode * - a loop for all other codes in the mode * When the fast-path runs into a code that it cannot handle, its loop ends and it * runs into the following loop to handle the other codes. * The end of the input or output buffer is also handled by the slower loop. * The slow loop jumps (goto) to the fast-path loop again as soon as possible. * * The callback handling is done by returning with an error code. * The conversion framework actually calls the callback function. 
*/ if(isSingleByteMode) { /* fast path for single-byte mode */ if(state==readCommand) { fastSingle: while(source=0x20) { ++source; if(b<=0x7f) { /* write US-ASCII graphic character or DEL */ *target++=(UChar)b; } else { /* write from dynamic window */ uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); if(c<=0xffff) { *target++=(UChar)c; } else { /* output surrogate pair */ *target++=(UChar)(0xd7c0+(c>>10)); if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); cnv->UCharErrorBufferLength=1; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; goto endloop; } } } } } /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ singleByteMode: while(source=targetLimit) { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } b=*source++; switch(state) { case readCommand: /* redundant conditions are commented out */ /* here: b<0x20 because otherwise we would be in fastSingle */ if((1UL<toUBytes[0]=b; cnv->toULength=1; goto endloop; } /* store the first byte of a multibyte sequence in toUBytes[] */ cnv->toUBytes[0]=b; cnv->toULength=1; break; case quotePairOne: byteOne=b; cnv->toUBytes[1]=b; cnv->toULength=2; state=quotePairTwo; break; case quotePairTwo: *target++=(UChar)((byteOne<<8)|b); state=readCommand; goto fastSingle; case quoteOne: if(b<0x80) { /* all static offsets are in the BMP */ *target++=(UChar)(staticOffsets[quoteWindow]+b); } else { /* write from dynamic window */ uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); if(c<=0xffff) { *target++=(UChar)c; } else { /* output surrogate pair */ *target++=(UChar)(0xd7c0+(c>>10)); if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); cnv->UCharErrorBufferLength=1; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; goto endloop; } } } state=readCommand; goto fastSingle; case definePairOne: dynamicWindow=(int8_t)((b>>5)&7); byteOne=(uint8_t)(b&0x1f); cnv->toUBytes[1]=b; cnv->toULength=2; state=definePairTwo; break; case definePairTwo: 
scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); state=readCommand; goto fastSingle; case defineOne: if(b==0) { /* callback(illegal): Reserved window offset value 0 */ cnv->toUBytes[1]=b; cnv->toULength=2; goto endloop; } else if(btoUDynamicOffsets[dynamicWindow]=b<<7UL; } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; } else if(b>=fixedThreshold) { scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; } else { /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ cnv->toUBytes[1]=b; cnv->toULength=2; goto endloop; } state=readCommand; goto fastSingle; } } } else { /* fast path for Unicode mode */ if(state==readCommand) { fastUnicode: while(source+1(Urs-UC0)) { *target++=(UChar)((b<<8)|source[1]); source+=2; } } /* normal state machine for Unicode mode */ /* unicodeByteMode: */ while(source=targetLimit) { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } b=*source++; switch(state) { case readCommand: if((uint8_t)(b-UC0)>(Urs-UC0)) { byteOne=b; cnv->toUBytes[0]=b; cnv->toULength=1; state=quotePairTwo; } else if(/* UC0<=b && */ b<=UC7) { dynamicWindow=(int8_t)(b-UC0); isSingleByteMode=TRUE; goto fastSingle; } else if(/* UD0<=b && */ b<=UD7) { dynamicWindow=(int8_t)(b-UD0); isSingleByteMode=TRUE; cnv->toUBytes[0]=b; cnv->toULength=1; state=defineOne; goto singleByteMode; } else if(b==UDX) { isSingleByteMode=TRUE; cnv->toUBytes[0]=b; cnv->toULength=1; state=definePairOne; goto singleByteMode; } else if(b==UQU) { cnv->toUBytes[0]=b; cnv->toULength=1; state=quotePairOne; } else /* Urs */ { /* callback(illegal) */ *pErrorCode=U_ILLEGAL_CHAR_FOUND; cnv->toUBytes[0]=b; cnv->toULength=1; goto endloop; } break; case quotePairOne: byteOne=b; cnv->toUBytes[1]=b; cnv->toULength=2; state=quotePairTwo; break; case quotePairTwo: *target++=(UChar)((byteOne<<8)|b); state=readCommand; goto fastUnicode; } } } endloop: /* set the converter 
state back into UConverter */ if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { /* reset to deal with the next character */ state=readCommand; } else if(state==readCommand) { /* not in a multi-byte sequence, reset toULength */ cnv->toULength=0; } scsu->toUIsSingleByteMode=isSingleByteMode; scsu->toUState=state; scsu->toUQuoteWindow=quoteWindow; scsu->toUDynamicWindow=dynamicWindow; scsu->toUByteOne=byteOne; /* write back the updated pointers */ pArgs->source=(const char *)source; pArgs->target=target; return; } /* SCSU-from-Unicode conversion functions ----------------------------------- */ /* * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve * reasonable results. The lookahead is minimal. * Many cases are simple: * A character fits directly into the current mode, a dynamic or static window, * or is not compressible. These cases are tested first. * Real compression heuristics are applied to the rest, in code branches for * single/Unicode mode and BMP/supplementary code points. * The heuristics used here are extremely simple. */ /* get the number of the window that this character is in, or -1 */ static int8_t getWindow(const uint32_t offsets[8], uint32_t c) { int i; for(i=0; i<8; ++i) { if((uint32_t)(c-offsets[i])<=0x7f) { return (int8_t)(i); } } return -1; } /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ static UBool isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { return (UBool)(c<=offset+0x7f && (c>=offset || (c<=0x7f && (c>=0x20 || (1UL<windowUse[scsu->nextWindowUseIndex]; if(++scsu->nextWindowUseIndex==8) { scsu->nextWindowUseIndex=0; } return window; } /* * useDynamicWindow() adjusts * windowUse[] and nextWindowUseIndex for the algorithm to choose * the next dynamic window to be defined; * a subclass may override it and provide its own algorithm. 
*/ static void useDynamicWindow(SCSUData *scsu, int8_t window) { /* * move the existing window, which just became the most recently used one, * up in windowUse[] to nextWindowUseIndex-1 */ /* first, find the index of the window - backwards to favor the more recently used windows */ int i, j; i=scsu->nextWindowUseIndex; do { if(--i<0) { i=7; } } while(scsu->windowUse[i]!=window); /* now copy each windowUse[i+1] to [i] */ j=i+1; if(j==8) { j=0; } while(j!=scsu->nextWindowUseIndex) { scsu->windowUse[i]=scsu->windowUse[j]; i=j; if(++j==8) { j=0; } } /* finally, set the window into the most recently used index */ scsu->windowUse[i]=window; } /* * calculate the offset and the code for a dynamic window that contains the character * takes fixed offsets into account * the offset of the window is stored in the offset variable, * the code is returned * * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code */ static int getDynamicOffset(uint32_t c, uint32_t *pOffset) { int i; for(i=0; i<7; ++i) { if((uint32_t)(c-fixedOffsets[i])<=0x7f) { *pOffset=fixedOffsets[i]; return 0xf9+i; } } if(c<0x80) { /* No dynamic window for US-ASCII. */ return -1; } else if(c<0x3400 || (uint32_t)(c-0x10000)<(0x14000-0x10000) || (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) ) { /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ *pOffset=c&0x7fffff80; return (int)(c>>7); } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { /* For these characters we need to take the gapOffset into account. */ *pOffset=c&0x7fffff80; return (int)((c-gapOffset)>>7); } else { return -1; } } /* * Idea for compression: * - save SCSUData and other state before really starting work * - at endloop, see if compression could be better with just unicode mode * - don't do this if a callback has been called * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning * - different buffer handling! 
* * Drawback or need for corrective handling: * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible * not only for compression but also for HTML/XML documents with following charset/encoding announcers. * * How to achieve both? * - Only replace the result after an SDX or SCU? */ static void _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; SCSUData *scsu; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; UBool isSingleByteMode; uint8_t dynamicWindow; uint32_t currentOffset; uint32_t c, delta; int32_t sourceIndex, nextSourceIndex; int32_t length; /* variables for compression heuristics */ uint32_t offset; UChar lead, trail; int code; int8_t window; /* set up the local pointers */ cnv=pArgs->converter; scsu=(SCSUData *)cnv->extraInfo; /* set up the local pointers */ source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the state machine state */ isSingleByteMode=scsu->fromUIsSingleByteMode; dynamicWindow=scsu->fromUDynamicWindow; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; c=cnv->fromUChar32; /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 
0 : -1; nextSourceIndex=0; /* similar conversion "loop" as in toUnicode */ loop: if(isSingleByteMode) { if(c!=0 && targetCapacity>0) { goto getTrailSingle; } /* state machine for single-byte mode */ /* singleByteMode: */ while(sourcefromUDynamicOffsets, c))>=0) { /* there is a dynamic window that contains this character, change to it */ dynamicWindow=window; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else if((code=getDynamicOffset(c, &offset))>=0) { /* might check if there are more characters in this window to come */ /* define an extended window with this character */ code-=0x200; dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=4; goto outputBytes; } else { /* change to Unicode mode and output this (lead, trail) pair */ isSingleByteMode=FALSE; *target++=(uint8_t)SCU; if(offsets!=NULL) { *offsets++=sourceIndex; } --targetCapacity; c=((uint32_t)lead<<16)|trail; length=4; goto outputBytes; } } else if(c<0xa0) { /* quote C1 control character */ c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ length=2; goto outputBytes; } else if(c==0xfeff || c>=0xfff0) { /* quote signature character=byte order mark and specials */ c|=SQU<<16; length=3; goto outputBytes; } else { /* compress all other BMP characters */ if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { /* there is a window defined that contains this character - switch to it or quote from it? 
*/ if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { /* change to dynamic window */ dynamicWindow=window; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else { /* quote from dynamic window */ c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; length=2; goto outputBytes; } } else if((window=getWindow(staticOffsets, c))>=0) { /* quote from static window */ c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); length=2; goto outputBytes; } else if((code=getDynamicOffset(c, &offset))>=0) { /* define a dynamic window with this character */ dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=3; goto outputBytes; } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) ) { /* * this character is not compressible (a BMP ideograph or similar); * switch to Unicode mode if this is the last character in the block * or there is at least one more ideograph following immediately */ isSingleByteMode=FALSE; c|=SCU<<16; length=3; goto outputBytes; } else { /* quote Unicode */ c|=SQU<<16; length=3; goto outputBytes; } } /* normal end of conversion: prepare for a new character */ c=0; sourceIndex=nextSourceIndex; } } else { if(c!=0 && targetCapacity>0) { goto getTrailUnicode; } /* state machine for Unicode mode */ /* unicodeByteMode: */ while(source=2) { *target++=(uint8_t)(c>>8); *target++=(uint8_t)c; if(offsets!=NULL) { *offsets++=sourceIndex; *offsets++=sourceIndex; } targetCapacity-=2; } else { length=2; goto outputBytes; } } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { /* compress BMP character if the 
following one is not an uncompressible ideograph */ if(!(sourcefromUDynamicOffsets, c))>=0) { /* there is a dynamic window that contains this character, change to it */ isSingleByteMode=TRUE; dynamicWindow=window; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else if((code=getDynamicOffset(c, &offset))>=0) { /* define a dynamic window with this character */ isSingleByteMode=TRUE; dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=3; goto outputBytes; } } /* don't know how to compress this character, just write it directly */ length=2; goto outputBytes; } else if(c<0xe000) { /* c is a surrogate */ if(UTF_IS_SURROGATE_FIRST(c)) { getTrailUnicode: lead=(UChar)c; if(sourcefromUDynamicOffsets, c))>=0 && !(sourcefromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else if(source=0 ) { /* two supplementary characters in (probably) the same window - define an extended one */ isSingleByteMode=TRUE; code-=0x200; dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=4; goto outputBytes; } else { /* don't know how to compress this character, just write it directly */ c=((uint32_t)lead<<16)|trail; length=4; goto outputBytes; } } else /* 0xe000<=c<0xf300 */ { /* quote to avoid SCSU tags */ c|=UQU<<16; length=3; goto outputBytes; } /* normal end of conversion: prepare for a new character */ c=0; sourceIndex=nextSourceIndex; } } endloop: /* set the converter state 
back into UConverter */ scsu->fromUIsSingleByteMode=isSingleByteMode; scsu->fromUDynamicWindow=dynamicWindow; cnv->fromUChar32=c; /* write back the updated pointers */ pArgs->source=source; pArgs->target=(char *)target; pArgs->offsets=offsets; return; outputBytes: /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { if(offsets==NULL) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(c>>24); case 3: *target++=(uint8_t)(c>>16); case 2: *target++=(uint8_t)(c>>8); case 1: *target++=(uint8_t)c; default: /* will never occur */ break; } } else { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(c>>24); *offsets++=sourceIndex; case 3: *target++=(uint8_t)(c>>16); *offsets++=sourceIndex; case 2: *target++=(uint8_t)(c>>8); *offsets++=sourceIndex; case 1: *target++=(uint8_t)c; *offsets++=sourceIndex; default: /* will never occur */ break; } } targetCapacity-=length; /* normal end of conversion: prepare for a new character */ c=0; sourceIndex=nextSourceIndex; goto loop; } else { uint8_t *p; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. 
*/ /* we know that 0<=targetCapacitycharErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 4: *p++=(uint8_t)(c>>24); case 3: *p++=(uint8_t)(c>>16); case 2: *p++=(uint8_t)(c>>8); case 1: *p=(uint8_t)c; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ c>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(c>>16); if(offsets!=NULL) { *offsets++=sourceIndex; } case 2: *target++=(uint8_t)(c>>8); if(offsets!=NULL) { *offsets++=sourceIndex; } case 1: *target++=(uint8_t)c; if(offsets!=NULL) { *offsets++=sourceIndex; } default: break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; c=0; goto endloop; } } /* * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. * If a change is made in the original function, then either * change this function the same way or * re-copy the original function and remove the variables * offsets, sourceIndex, and nextSourceIndex. 
*/
/*
 * Converts the UTF-16 text in [pArgs->source, pArgs->sourceLimit) to SCSU
 * bytes written at pArgs->target; this variant records no offsets.
 *
 * The encoder state (single-byte vs. Unicode mode, the active dynamic window
 * and the pending value in cnv->fromUChar32) is loaded from the SCSUData in
 * cnv->extraInfo on entry and stored back at endloop together with the
 * advanced source and target pointers.
 *
 * outputBytes writes the low "length" bytes of c, most significant first.
 * If they do not all fit into the target, the bytes that do not fit are
 * stored through p (the overflow buffer, per the comment there), their count
 * goes into cnv->charErrorBufferLength, and *pErrorCode is set to
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): several spans of this function look truncated in this copy
 * (for example, the getTrailSingle label targeted by a goto is not visible,
 * and both while(source... loop headers are incomplete) - restore them from
 * the upstream file before relying on this text.
 * NOTE(review): _SCSUFromUnicodeWithOffsets contains the same
 * "c>>=8*length" overflow shift that is guarded below; it needs the same
 * guard to keep the two functions in sync.
 */
static void _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; SCSUData *scsu; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; UBool isSingleByteMode; uint8_t dynamicWindow; uint32_t currentOffset; uint32_t c, delta; int32_t length; /* variables for compression heuristics */ uint32_t offset; UChar lead, trail; int code; int8_t window; /* set up the converter and its SCSU state */ cnv=pArgs->converter; scsu=(SCSUData *)cnv->extraInfo; /* set up the local source and target pointers */ source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); /* get the state machine state */ isSingleByteMode=scsu->fromUIsSingleByteMode; dynamicWindow=scsu->fromUDynamicWindow; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; c=cnv->fromUChar32; /* similar conversion "loop" as in toUnicode */ loop: if(isSingleByteMode) { if(c!=0 && targetCapacity>0) { goto getTrailSingle; } /* state machine for single-byte mode */ /* singleByteMode: */ while(sourcefromUDynamicOffsets, c))>=0) { /* there is a dynamic window that contains this character, change to it */ dynamicWindow=window; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else if((code=getDynamicOffset(c, &offset))>=0) { /* might check if there are more characters in this window to come */ /* define an extended window with this character */ code-=0x200; dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=4; goto outputBytes; } else { /* change to Unicode mode and output this (lead, trail) pair */ isSingleByteMode=FALSE; *target++=(uint8_t)SCU; --targetCapacity; 
c=((uint32_t)lead<<16)|trail; length=4; goto outputBytes; } } else if(c<0xa0) { /* quote C1 control character */ c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ length=2; goto outputBytes; } else if(c==0xfeff || c>=0xfff0) { /* quote signature character=byte order mark and specials */ c|=SQU<<16; length=3; goto outputBytes; } else { /* compress all other BMP characters */ if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { /* there is a window defined that contains this character - switch to it or quote from it? */ if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { /* change to dynamic window */ dynamicWindow=window; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else { /* quote from dynamic window */ c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; length=2; goto outputBytes; } } else if((window=getWindow(staticOffsets, c))>=0) { /* quote from static window */ c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); length=2; goto outputBytes; } else if((code=getDynamicOffset(c, &offset))>=0) { /* define a dynamic window with this character */ dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=3; goto outputBytes; } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) ) { /* * this character is not compressible (a BMP ideograph or similar); * switch to Unicode mode if this is the last character in the block * or there is at least one more ideograph following immediately */ isSingleByteMode=FALSE; c|=SCU<<16; length=3; goto outputBytes; } else { /* quote Unicode */ c|=SQU<<16; length=3; goto outputBytes; } } /* normal 
end of conversion: prepare for a new character */ c=0; } } else { if(c!=0 && targetCapacity>0) { goto getTrailUnicode; } /* state machine for Unicode mode */ /* unicodeByteMode: */ while(source=2) { *target++=(uint8_t)(c>>8); *target++=(uint8_t)c; targetCapacity-=2; } else { length=2; goto outputBytes; } } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { /* compress BMP character if the following one is not an uncompressible ideograph */ if(!(sourcefromUDynamicOffsets, c))>=0) { /* there is a dynamic window that contains this character, change to it */ isSingleByteMode=TRUE; dynamicWindow=window; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else if((code=getDynamicOffset(c, &offset))>=0) { /* define a dynamic window with this character */ isSingleByteMode=TRUE; dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=3; goto outputBytes; } } /* don't know how to compress this character, just write it directly */ length=2; goto outputBytes; } else if(c<0xe000) { /* c is a surrogate */ if(UTF_IS_SURROGATE_FIRST(c)) { getTrailUnicode: lead=(UChar)c; if(sourcefromUDynamicOffsets, c))>=0 && !(sourcefromUDynamicOffsets[dynamicWindow]; useDynamicWindow(scsu, dynamicWindow); c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; length=2; goto outputBytes; } else if(source=0 ) { /* two supplementary characters in (probably) the same window - define an extended one */ isSingleByteMode=TRUE; code-=0x200; dynamicWindow=getNextDynamicWindow(scsu); currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; useDynamicWindow(scsu, dynamicWindow); 
c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; length=4; goto outputBytes; } else { /* don't know how to compress this character, just write it directly */ c=((uint32_t)lead<<16)|trail; length=4; goto outputBytes; } } else /* 0xe000<=c<0xf300 */ { /* quote to avoid SCSU tags */ c|=UQU<<16; length=3; goto outputBytes; } /* normal end of conversion: prepare for a new character */ c=0; } } endloop: /* set the converter state back into UConverter */ scsu->fromUIsSingleByteMode=isSingleByteMode; scsu->fromUDynamicWindow=dynamicWindow; cnv->fromUChar32=c; /* write back the updated pointers */ pArgs->source=source; pArgs->target=(char *)target; return; outputBytes: /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(c>>24); case 3: *target++=(uint8_t)(c>>16); case 2: *target++=(uint8_t)(c>>8); case 1: *target++=(uint8_t)c; default: /* will never occur */ break; } targetCapacity-=length; /* normal end of conversion: prepare for a new character */ c=0; goto loop; } else { uint8_t *p; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. 
*/ /* we know that 0<=targetCapacitycharErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 4: *p++=(uint8_t)(c>>24); case 3: *p++=(uint8_t)(c>>16); case 2: *p++=(uint8_t)(c>>8); case 1: *p=(uint8_t)c; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ /* if nothing fit (targetCapacity was 0, which the SCU path in single-byte mode can produce), length is still 4 and shifting the 32-bit c by 32 bits would be undefined behavior; c is not used in that case, so use 0 instead of shifting */ c=(length==4) ? 0 : c>>(8*length); /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(c>>16); case 2: *target++=(uint8_t)(c>>8); case 1: *target++=(uint8_t)c; default: break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; c=0; goto endloop; } }
/* miscellaneous ------------------------------------------------------------ */
/*
 * Returns the converter name string: "SCSU,locale=ja" when the locale field
 * of the SCSUData in cnv->extraInfo is l_ja, and "SCSU" otherwise.
 */
static const char * _SCSUGetName(const UConverter *cnv) { SCSUData *scsu=(SCSUData *)cnv->extraInfo; switch(scsu->locale) { case l_ja: return "SCSU,locale=ja"; default: return "SCSU"; } }
/* layout of a safe clone: the UConverter followed by its SCSUData; used for the size calculation and the placement in _SCSUSafeClone */
struct cloneSCSUStruct { UConverter cnv; SCSUData mydata; };
/*
 * SafeClone callback: copies the SCSU-specific state for a cloned converter.
 *
 * If *status already indicates a failure, returns NULL without doing anything.
 * If *pBufferSize is 0, this is a preflighting request: the required size
 * (sizeof(struct cloneSCSUStruct)) is stored in *pBufferSize and NULL is
 * returned. Otherwise stackBuffer is treated as a struct cloneSCSUStruct,
 * the SCSUData from cnv->extraInfo is copied into its mydata member, the
 * clone's extraInfo is pointed at that copy, isExtraLocal is set, and the
 * embedded UConverter is returned.
 *
 * NOTE(review): nothing here checks that stackBuffer is non-NULL or that
 * *pBufferSize is at least the required size - presumably guaranteed by the
 * caller (ucnv_safeClone); confirm.
 */
static UConverter * _SCSUSafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { struct cloneSCSUStruct * localClone; int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); if (U_FAILURE(*status)){ return NULL; } if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ *pBufferSize = bufferSizeNeeded; return NULL; } localClone = (struct cloneSCSUStruct *)stackBuffer; /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); localClone->cnv.extraInfo = &localClone->mydata; localClone->cnv.isExtraLocal = TRUE; return &localClone->cnv; }
/* implementation table for the SCSU converter type; entries that are not provided are NULL */
static const UConverterImpl _SCSUImpl={ UCNV_SCSU, NULL, NULL, _SCSUOpen, _SCSUClose, _SCSUReset, _SCSUToUnicode, _SCSUToUnicodeWithOffsets, _SCSUFromUnicode, _SCSUFromUnicodeWithOffsets, NULL, NULL, _SCSUGetName, NULL, _SCSUSafeClone, ucnv_getCompleteUnicodeSet };
/* static description of the SCSU converter */
static const UConverterStaticData _SCSUStaticData={ sizeof(UConverterStaticData), "SCSU", 1212, /* CCSID for SCSU */ UCNV_IBM, UCNV_SCSU, 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ /* * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode * substitution string. (The bytes are SQU, 0x0e, followed by 0xff 0xfd.) */ { 0x0e, 0xff, 0xfd, 0 }, 3, FALSE, FALSE, 0, 0, { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ };
/* shared data record for SCSU; it references _SCSUStaticData and _SCSUImpl */
const UConverterSharedData _SCSUData={ sizeof(UConverterSharedData), ~((uint32_t)0), NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, 0 }; #endif