// -------------------------------------------------------------------------------------------------
// IsClosingingQuote
//
// Tells whether uniChar ends a quoted run that was opened with openQuote. closeQuote is the
// closer the caller expects for that opener. U+301D is special: either U+301E or U+301F is
// accepted as its closer, regardless of what closeQuote holds.
static inline bool
IsClosingingQuote ( UniCodePoint uniChar, UniCodePoint openQuote, UniCodePoint closeQuote )
{
	if ( uniChar == closeQuote ) return true;

	// Only the U+301D opener has alternative closers.
	if ( openQuote != UCP(0x301D) ) return false;

	return ( (uniChar == UCP(0x301E)) || (uniChar == UCP(0x301F)) );

}	// IsClosingingQuote
void cbconv(struct bsdconv_instance *ins){ unsigned char *data; struct bsdconv_phase *this_phase=THIS_PHASE(ins); struct my_s *r=THIS_CODEC(ins)->priv; FILE *fp=r->score; int i; uint32_t ucs=0; unsigned char v; uint32_t score; data=this_phase->curr->data; if(r->scorer!=NULL){ score=r->scorer->cbscorer(this_phase->curr); *(r->counter)+=score; if(score==0){ this_phase->state.status=DEADEND; return; } }else if(fp!=NULL && this_phase->curr->len>0 && UCP(this_phase->curr->data)[0]==0x1){ for(i=1;i<this_phase->curr->len;++i){ ucs<<=8; ucs|=data[i]; } fseek(fp, ucs*sizeof(unsigned char), SEEK_SET); fread(&v, sizeof(unsigned char), 1, fp); *(r->counter)+=v; } this_phase->data_tail->next=dup_data_rt(ins, this_phase->curr); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->state.status=NEXTPHASE; return; }
void cbconv(struct bsdconv_instance *ins){ FILE *fp=THIS_CODEC(ins)->priv; int i; ins->phase[ins->phase_index].state.status=NEXTPHASE; for(i=0;i<ins->phase[ins->phase_index].curr->len;++i){ fprintf(fp, "%02X",UCP(ins->phase[ins->phase_index].curr->data)[i]); } if(ins->phase[ins->phase_index].curr->flags){ fprintf(fp, " ("); if(ins->phase[ins->phase_index].curr->flags & F_FREE) fprintf(fp, " FREE"); if(ins->phase[ins->phase_index].curr->flags & F_MARK) fprintf(fp, " MARK"); fprintf(fp, " )"); } fprintf(fp, "\n"); }
void cbconv(struct bsdconv_instance *ins){ int i; char *p; struct bsdconv_phase *this_phase=THIS_PHASE(ins); this_phase->state.status=NEXTPHASE; DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->flags=F_FREE; this_phase->data_tail->len=this_phase->curr->len*2; p=this_phase->data_tail->data=malloc(this_phase->data_tail->len+1); for(i=0;i<this_phase->curr->len;++i){ sprintf(p,"%02X", UCP(this_phase->curr->data)[i]); TAILIZE(p); } }
/*
 * Filter predicate: returns 1 when the datum is non-empty and its first byte
 * equals TYPE, otherwise 0.
 */
int cbfilter(struct data_rt *data){
	if(!(data->len>0)){
		return 0;
	}
	return UCP(data->data)[0]==TYPE;
}
static void ClassifyCharacter ( XMP_StringPtr fullString, size_t offset, UniCharKind * charKind, size_t * charSize, UniCodePoint * uniChar ) { *charKind = UCK_normal; // Assume typical case. unsigned char currByte = UnsByte ( fullString[offset] ); if ( currByte < UnsByte(0x80) ) { // ---------------------------------------- // We've got a single byte ASCII character. *charSize = 1; *uniChar = currByte; if ( currByte > UnsByte(0x22) ) { if ( currByte == UnsByte(0x2C) ) { *charKind = UCK_comma; } else if ( currByte == UnsByte(0x3B) ) { *charKind = UCK_semicolon; } else if ( (currByte == UnsByte(0x5B)) || (currByte == UnsByte(0x5D)) ) { *charKind = UCK_quote; // ! ASCII '[' and ']' are used as quotes in Chinese and Korean. } } else { // currByte <= 0x22 if ( currByte == UnsByte(0x22) ) { *charKind = UCK_quote; } else if ( currByte == UnsByte(0x21) ) { *charKind = UCK_normal; } else if ( currByte == UnsByte(0x20) ) { *charKind = UCK_space; } else { *charKind = UCK_control; } } } else { // currByte >= 0x80 // --------------------------------------------------------------------------------------- // We've got a multibyte Unicode character. The first byte has the number of bytes and the // highest order bits. The other bytes each add 6 more bits. Compose the UTF-32 form so we // can classify directly with the Unicode code points. Order the upperBits tests to be // fastest for Japan, probably the most common non-ASCII usage. *charSize = 0; *uniChar = currByte; while ( (*uniChar & 0x80) != 0 ) { // Count the leading 1 bits in the byte. ++(*charSize); *uniChar = *uniChar << 1; } XMP_Assert ( (offset + *charSize) <= strlen(fullString) ); *uniChar = *uniChar & 0x7F; // Put the character bits in the bottom of uniChar. *uniChar = *uniChar >> *charSize; for ( size_t i = (offset + 1); i < (offset + *charSize); ++i ) { *uniChar = (*uniChar << 6) | (UnsByte(fullString[i]) & 0x3F); } XMP_Uns32 upperBits = *uniChar >> 8; // First filter on just the high order 24 bits. 
if ( upperBits == 0xFF ) { // U+FFxx if ( *uniChar == UCP(0xFF0C) ) { *charKind = UCK_comma; // U+FF0C, full width comma. } else if ( *uniChar == UCP(0xFF1B) ) { *charKind = UCK_semicolon; // U+FF1B, full width semicolon. } else if ( *uniChar == UCP(0xFF64) ) { *charKind = UCK_comma; // U+FF64, half width ideographic comma. } } else if ( upperBits == 0xFE ) { // U+FE-- if ( *uniChar == UCP(0xFE50) ) { *charKind = UCK_comma; // U+FE50, small comma. } else if ( *uniChar == UCP(0xFE51) ) { *charKind = UCK_comma; // U+FE51, small ideographic comma. } else if ( *uniChar == UCP(0xFE54) ) { *charKind = UCK_semicolon; // U+FE54, small semicolon. } } else if ( upperBits == 0x30 ) { // U+30-- if ( *uniChar == UCP(0x3000) ) { *charKind = UCK_space; // U+3000, ideographic space. } else if ( *uniChar == UCP(0x3001) ) { *charKind = UCK_comma; // U+3001, ideographic comma. } else if ( (UCP(0x3008) <= *uniChar) && (*uniChar <= UCP(0x300F)) ) { *charKind = UCK_quote; // U+3008..U+300F, various quotes. } else if ( *uniChar == UCP(0x303F) ) { *charKind = UCK_space; // U+303F, ideographic half fill space. } else if ( (UCP(0x301D) <= *uniChar) && (*uniChar <= UCP(0x301F)) ) { *charKind = UCK_quote; // U+301D..U+301F, double prime quotes. } } else if ( upperBits == 0x20 ) { // U+20-- if ( (UCP(0x2000) <= *uniChar) && (*uniChar <= UCP(0x200B)) ) { *charKind = UCK_space; // U+2000..U+200B, en quad through zero width space. } else if ( *uniChar == UCP(0x2015) ) { *charKind = UCK_quote; // U+2015, dash quote. } else if ( (UCP(0x2018) <= *uniChar) && (*uniChar <= UCP(0x201F)) ) { *charKind = UCK_quote; // U+2018..U+201F, various quotes. } else if ( *uniChar == UCP(0x2028) ) { *charKind = UCK_control; // U+2028, line separator. } else if ( *uniChar == UCP(0x2029) ) { *charKind = UCK_control; // U+2029, paragraph separator. } else if ( (*uniChar == UCP(0x2039)) || (*uniChar == UCP(0x203A)) ) { *charKind = UCK_quote; // U+2039 and U+203A, guillemet quotes. 
} } else if ( upperBits == 0x06 ) { // U+06-- if ( *uniChar == UCP(0x060C) ) { *charKind = UCK_comma; // U+060C, Arabic comma. } else if ( *uniChar == UCP(0x061B) ) { *charKind = UCK_semicolon; // U+061B, Arabic semicolon. } } else if ( upperBits == 0x05 ) { // U+05-- if ( *uniChar == UCP(0x055D) ) { *charKind = UCK_comma; // U+055D, Armenian comma. } } else if ( upperBits == 0x03 ) { // U+03-- if ( *uniChar == UCP(0x037E) ) { *charKind = UCK_semicolon; // U+037E, Greek "semicolon" (really a question mark). } } else if ( upperBits == 0x00 ) { // U+00-- if ( (*uniChar == UCP(0x00AB)) || (*uniChar == UCP(0x00BB)) ) { *charKind = UCK_quote; // U+00AB and U+00BB, guillemet quotes. } } } } // ClassifyCharacter
// -------------------------------------------------------------------------------------------------
// GetClosingQuote
//
// Maps an opening quote code point to the code point that closes it. Returns 0 when openQuote
// is not one of the recognized opening quotes.
static UniCodePoint
GetClosingQuote ( UniCodePoint openQuote )
{
	switch ( openQuote ) {

		case UCP(0x0022) : return UCP(0x0022);	// ! U+0022 is both opening and closing.
		case UCP(0x005B) : return UCP(0x005D);

		case UCP(0x00AB) : return UCP(0x00BB);	// ! U+00AB and U+00BB are reversible.
		case UCP(0x00BB) : return UCP(0x00AB);

		case UCP(0x2015) : return UCP(0x2015);	// ! U+2015 is both opening and closing.

		case UCP(0x2018) : return UCP(0x2019);
		case UCP(0x201A) : return UCP(0x201B);
		case UCP(0x201C) : return UCP(0x201D);
		case UCP(0x201E) : return UCP(0x201F);

		case UCP(0x2039) : return UCP(0x203A);	// ! U+2039 and U+203A are reversible.
		case UCP(0x203A) : return UCP(0x2039);

		case UCP(0x3008) : return UCP(0x3009);
		case UCP(0x300A) : return UCP(0x300B);
		case UCP(0x300C) : return UCP(0x300D);
		case UCP(0x300E) : return UCP(0x300F);

		case UCP(0x301D) : return UCP(0x301F);	// ! U+301E also closes U+301D.

		default : return 0;

	}

}	// GetClosingQuote
void cbconv(struct bsdconv_instance *ins){ struct bsdconv_phase *this_phase=THIS_PHASE(ins); struct my_s *t=THIS_CODEC(ins)->priv; int i; unsigned int u; char *p; if(t->filter==1 && this_phase->curr->len>1 && UCP(this_phase->curr->data)[0]==1){ //unicode if(t->mode==16){ DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->flags=F_FREE; this_phase->data_tail->len=(this_phase->curr->len-1)*2+t->prefix.len + t->suffix.len; this_phase->data_tail->data=malloc(this_phase->data_tail->len+1); memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len); p=this_phase->data_tail->data+t->prefix.len; for(i=1;i<this_phase->curr->len;++i){ p+=sprintf(p,"%02X", UCP(this_phase->curr->data)[i]); } memcpy(p, t->suffix.data, t->suffix.len); ins->phase[ins->phase_index].state.status=NEXTPHASE; }else if(t->mode==10){ DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->flags=F_FREE; this_phase->data_tail->len=(this_phase->curr->len-1)*3+t->prefix.len + t->suffix.len; this_phase->data_tail->data=malloc(this_phase->data_tail->len+1); memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len); p=this_phase->data_tail->data+t->prefix.len; u=0; for(i=1;i<this_phase->curr->len;++i){ u*=256; u+=UCP(this_phase->curr->data)[i]; } p+=sprintf(p, "%u", u); memcpy(p, t->suffix.data, t->suffix.len); this_phase->data_tail->len=(p+t->suffix.len)-CP(this_phase->data_tail->data); ins->phase[ins->phase_index].state.status=NEXTPHASE; }else{ ins->phase[ins->phase_index].state.status=DEADEND; } }else if(t->filter==3 && this_phase->curr->len==2 && UCP(this_phase->curr->data)[0]==3){ //byte if(t->mode==8){ DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->flags=F_FREE; 
this_phase->data_tail->len=3+t->prefix.len + t->suffix.len; this_phase->data_tail->data=malloc(this_phase->data_tail->len+1); memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len); p=this_phase->data_tail->data+t->prefix.len; i=UCP(this_phase->curr->data)[1]; *UCP(p+2)=i%8+'0'; i/=8; *UCP(p+1)=i%8+'0'; i/=8; *UCP(p)=i+'0'; memcpy(p+3, t->suffix.data, t->suffix.len); ins->phase[ins->phase_index].state.status=NEXTPHASE; }else if(t->mode==10){ DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->flags=F_FREE; this_phase->data_tail->len=3+t->prefix.len + t->suffix.len; this_phase->data_tail->data=malloc(this_phase->data_tail->len+1); memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len); p=this_phase->data_tail->data+t->prefix.len; p+=sprintf(p, "%d", UCP(this_phase->curr->data)[1]); memcpy(p, t->suffix.data, t->suffix.len); this_phase->data_tail->len=(p+t->suffix.len)-CP(this_phase->data_tail->data); ins->phase[ins->phase_index].state.status=NEXTPHASE; }else if(t->mode==16){ DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->flags=F_FREE; this_phase->data_tail->len=2+t->prefix.len + t->suffix.len; this_phase->data_tail->data=malloc(this_phase->data_tail->len+1); memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len); p=this_phase->data_tail->data+t->prefix.len; p+=sprintf(p, "%02X", UCP(this_phase->curr->data)[1]); memcpy(p, t->suffix.data, t->suffix.len); ins->phase[ins->phase_index].state.status=NEXTPHASE; }else{ ins->phase[ins->phase_index].state.status=DEADEND; } }else{ ins->phase[ins->phase_index].state.status=DEADEND; } return; }
void cbconv(struct bsdconv_instance *ins){ struct bsdconv_phase *this_phase=THIS_PHASE(ins); struct my_s *t=THIS_CODEC(ins)->priv; unsigned char d; unsigned char *c; struct data_st data; int max=sizeof(gb18030_table) / sizeof(struct gb18030_data) - 1; int min = 0; int mid; int i; union { unsigned char byte[4]; uint32_t ucs4; } ucs; for(;this_phase->i<this_phase->curr->len;this_phase->i+=1){ d=UCP(this_phase->curr->data)[this_phase->i]; memcpy(&data, (char *)(this_phase->codec[this_phase->index].data_z+(uintptr_t)this_phase->state.data), sizeof(struct data_st)); c=UCP(this_phase->codec[this_phase->index].data_z+(uintptr_t)data.data); next: switch(t->status){ case 0: if(t->status<data.len){ t->ucs=c[0]*10; t->status=1; goto next; } t->ucs=d*10; t->status=1; break; case 1: if(t->status<data.len){ t->ucs+=c[1]; t->ucs*=126; t->status=2; goto next; } t->ucs+=d; t->ucs*=126; t->status=2; break; case 2: if(t->status<data.len){ t->ucs+=c[2]; t->ucs*=10; t->status=3; goto next; } t->ucs+=d; t->ucs*=10; t->status=3; break; case 3: if(t->status<data.len){ t->ucs+=c[3]; t->status=0; goto next; } t->ucs+=d; t->status=0; if (t->ucs < gb18030_table[0].beg || t->ucs > gb18030_table[max].end){ DEADEND(); }else while (max >= min) { mid = (min + max) / 2; if (t->ucs > gb18030_table[mid].end) min = mid + 1; else if (t->ucs < gb18030_table[mid].beg) max = mid - 1; else{ break; } } if(gb18030_table[mid].beg<=t->ucs && t->ucs<=gb18030_table[mid].end){ ucs.ucs4=htobe32(gb18030_table[mid].off + (t->ucs - gb18030_table[mid].beg)); for(i=0;ucs.byte[i]==0 && i<4;++i); DATA_MALLOC(ins, this_phase->data_tail->next); this_phase->data_tail=this_phase->data_tail->next; this_phase->data_tail->next=NULL; this_phase->data_tail->len=5 - i; this_phase->data_tail->data=c=malloc(5 - i); this_phase->data_tail->flags=F_FREE; this_phase->state.status=NEXTPHASE; *c=0x01; c+=1; for(;i<4;++i,c+=1){ *c=ucs.byte[i]; } return; }else{ DEADEND(); } break; default: DEADEND(); } } this_phase->state.status=CONTINUE; 
return; }