// -------------------------------------------------------------------------------------------------
// IsClosingingQuote
//
// Tell whether uniChar terminates a quotation that was opened with openQuote.
//
//   uniChar    - the code point being examined.
//   openQuote  - the code point that opened the quotation.
//   closeQuote - the expected closing code point for openQuote.
//
// Returns true when uniChar equals closeQuote. As a special case, when the opening quote is
// U+301D, both U+301E and U+301F are accepted as closers regardless of closeQuote.

static inline bool
IsClosingingQuote ( UniCodePoint uniChar, UniCodePoint openQuote, UniCodePoint closeQuote )
{

	if ( uniChar == closeQuote ) return true;

	// Only U+301D has more than one possible closing form.
	if ( openQuote != UCP(0x301D) ) return false;

	return ( (uniChar == UCP(0x301E)) || (uniChar == UCP(0x301F)) );

}	// IsClosingingQuote
示例#2
0
文件: SCORE.c 项目: PkmX/bsdconv
/*
 * Scoring callback: accumulates a score for the current datum into
 * *(r->counter) and then forwards the datum unchanged to the next phase.
 *
 * Two scoring sources are supported:
 *   - r->scorer, when set: its cbscorer() result is added to the counter;
 *     a score of 0 marks the phase DEADEND and nothing is forwarded.
 *   - r->score (a FILE), when set and the datum starts with the tag byte
 *     0x01: the remaining bytes are folded, most significant first, into an
 *     offset, and the single byte stored at that offset in the file is
 *     added to the counter.
 *
 * If the seek or the one-byte read fails (for example the offset lies past
 * the end of the file) nothing is added to the counter; previously an
 * uninitialised byte would have been added in that case.
 */
void cbconv(struct bsdconv_instance *ins){
	struct bsdconv_phase *this_phase=THIS_PHASE(ins);
	struct my_s *r=THIS_CODEC(ins)->priv;
	FILE *fp=r->score;
	unsigned char *data=this_phase->curr->data;
	int i;
	uint32_t ucs=0;
	unsigned char v=0;
	uint32_t score;

	if(r->scorer!=NULL){
		score=r->scorer->cbscorer(this_phase->curr);
		*(r->counter)+=score;
		if(score==0){
			this_phase->state.status=DEADEND;
			return;
		}
	}else if(fp!=NULL && this_phase->curr->len>0 && data[0]==0x1){
		/* Bytes after the tag form the file offset, big-endian. */
		for(i=1;i<this_phase->curr->len;++i){
			ucs<<=8;
			ucs|=data[i];
		}
		/* Only trust v when both the seek and the read succeeded. */
		if(fseek(fp, (long)ucs, SEEK_SET)==0 && fread(&v, sizeof(v), 1, fp)==1){
			*(r->counter)+=v;
		}
	}

	/* Pass a copy of the current datum on to the output list. */
	this_phase->data_tail->next=dup_data_rt(ins, this_phase->curr);
	this_phase->data_tail=this_phase->data_tail->next;
	this_phase->data_tail->next=NULL;

	this_phase->state.status=NEXTPHASE;
	return;
}
示例#3
0
/*
 * Dump callback: writes the current datum to the codec's private FILE as
 * upper-case hex (two digits per byte), followed by the names of any set
 * flags in parentheses, then a newline. The phase status is set to
 * NEXTPHASE; no output datum is produced here.
 */
void cbconv(struct bsdconv_instance *ins){
	FILE *out=THIS_CODEC(ins)->priv;
	int pos=0;

	ins->phase[ins->phase_index].state.status=NEXTPHASE;

	/* Hex-encode every byte of the current datum. */
	while(pos<ins->phase[ins->phase_index].curr->len){
		fprintf(out, "%02X",UCP(ins->phase[ins->phase_index].curr->data)[pos]);
		++pos;
	}

	/* Annotate with flag names only when at least one flag bit is set. */
	if(ins->phase[ins->phase_index].curr->flags){
		fputs(" (", out);
		if(ins->phase[ins->phase_index].curr->flags & F_FREE){
			fputs(" FREE", out);
		}
		if(ins->phase[ins->phase_index].curr->flags & F_MARK){
			fputs(" MARK", out);
		}
		fputs(" )", out);
	}

	fputs("\n", out);
}
示例#4
0
文件: BSDCONV.c 项目: PkmX/bsdconv
/*
 * Hex-encoding callback: appends a new datum to the phase's output list
 * whose content is the current datum rendered as upper-case hex, two
 * characters per input byte. The new buffer is heap-allocated and flagged
 * F_FREE. The phase status is set to NEXTPHASE.
 */
void cbconv(struct bsdconv_instance *ins){
	struct bsdconv_phase *this_phase=THIS_PHASE(ins);
	char *cursor;
	int idx=0;

	this_phase->state.status=NEXTPHASE;

	/* Link a fresh node at the tail of the output list. */
	DATA_MALLOC(ins, this_phase->data_tail->next);
	this_phase->data_tail=this_phase->data_tail->next;
	this_phase->data_tail->next=NULL;
	this_phase->data_tail->flags=F_FREE;

	/* Two hex digits per byte, plus one spare byte for sprintf's NUL. */
	this_phase->data_tail->len=this_phase->curr->len*2;
	this_phase->data_tail->data=malloc(this_phase->data_tail->len+1);
	cursor=this_phase->data_tail->data;

	while(idx<this_phase->curr->len){
		sprintf(cursor,"%02X", UCP(this_phase->curr->data)[idx]);
		/* presumably advances cursor to the terminating NUL -- confirm against the TAILIZE macro */
		TAILIZE(cursor);
		++idx;
	}
}
示例#5
0
文件: type.c 项目: PkmX/bsdconv
/*
 * Filter predicate: returns 1 when the datum is non-empty and its first
 * byte equals TYPE, otherwise 0.
 */
int cbfilter(struct data_rt *data){
	return (data->len>0 && UCP(data->data)[0]==TYPE) ? 1 : 0;
}
// -------------------------------------------------------------------------------------------------
// ClassifyCharacter
//
// Decode the UTF-8 character that starts at fullString[offset] and classify it.
//
//   fullString - the UTF-8 text being scanned.
//   offset     - byte offset of the character's first byte.
//   charKind   - receives the classification (normal, space, comma, semicolon, quote, control).
//   charSize   - receives the number of bytes the character occupies.
//   uniChar    - receives the decoded code point.
//
// Anything that is not explicitly recognized below is reported as UCK_normal.

static void
ClassifyCharacter ( XMP_StringPtr fullString, size_t offset,
					UniCharKind * charKind, size_t * charSize, UniCodePoint * uniChar )
{
	const unsigned char leadByte = UnsByte ( fullString[offset] );
	UniCharKind kind = UCK_normal;	// The typical case, overridden below when recognized.

	if ( leadByte < UnsByte(0x80) ) {

		// -----------------------------
		// Single byte ASCII character.

		*charSize = 1;
		*uniChar = leadByte;

		switch ( leadByte ) {

			case 0x22 :		// ASCII double quote.
			case 0x5B :		// ! ASCII '[' and ']' are used as quotes in Chinese and Korean.
			case 0x5D :
				kind = UCK_quote;
				break;

			case 0x2C :
				kind = UCK_comma;
				break;

			case 0x3B :
				kind = UCK_semicolon;
				break;

			case 0x20 :
				kind = UCK_space;
				break;

			default :
				// Everything below the space is a control; the rest (including 0x21) is normal.
				if ( leadByte < UnsByte(0x20) ) kind = UCK_control;
				break;

		}

		*charKind = kind;
		return;

	}

	// ---------------------------------------------------------------------------------------------
	// Multibyte character. The number of leading 1 bits in the first byte is the byte count, the
	// remaining low bits of that byte are the highest order bits of the code point. Every following
	// byte contributes 6 more bits. Build the UTF-32 value so the tests can use code points.

	size_t byteCount = 0;
	UniCodePoint cp = leadByte;

	while ( (cp & 0x80) != 0 ) {	// Count the leading 1 bits by shifting them through bit 7.
		++byteCount;
		cp = cp << 1;
	}

	*charSize = byteCount;
	XMP_Assert ( (offset + *charSize) <= strlen(fullString) );

	cp = (cp & 0x7F) >> byteCount;	// Undo the shift, leaving just the first byte's payload bits.

	const size_t endOffset = offset + byteCount;
	for ( size_t pos = (offset + 1); pos < endOffset; ++pos ) {
		cp = (cp << 6) | (UnsByte(fullString[pos]) & 0x3F);
	}

	*uniChar = cp;

	// Select the block from the bits above the low byte, then test exact code points within it.
	const XMP_Uns32 highBits = cp >> 8;

	switch ( highBits ) {

		case 0xFF :		// U+FFxx
			if ( (cp == UCP(0xFF0C)) || (cp == UCP(0xFF64)) ) {
				kind = UCK_comma;		// Full width comma, half width ideographic comma.
			} else if ( cp == UCP(0xFF1B) ) {
				kind = UCK_semicolon;	// Full width semicolon.
			}
			break;

		case 0xFE :		// U+FExx
			if ( (cp == UCP(0xFE50)) || (cp == UCP(0xFE51)) ) {
				kind = UCK_comma;		// Small comma, small ideographic comma.
			} else if ( cp == UCP(0xFE54) ) {
				kind = UCK_semicolon;	// Small semicolon.
			}
			break;

		case 0x30 :		// U+30xx
			if ( (cp == UCP(0x3000)) || (cp == UCP(0x303F)) ) {
				kind = UCK_space;		// Ideographic space, ideographic half fill space.
			} else if ( cp == UCP(0x3001) ) {
				kind = UCK_comma;		// Ideographic comma.
			} else if ( ((UCP(0x3008) <= cp) && (cp <= UCP(0x300F))) ||
						((UCP(0x301D) <= cp) && (cp <= UCP(0x301F))) ) {
				kind = UCK_quote;		// U+3008..U+300F various quotes, U+301D..U+301F double prime quotes.
			}
			break;

		case 0x20 :		// U+20xx
			if ( (UCP(0x2000) <= cp) && (cp <= UCP(0x200B)) ) {
				kind = UCK_space;		// En quad through zero width space.
			} else if ( (cp == UCP(0x2028)) || (cp == UCP(0x2029)) ) {
				kind = UCK_control;		// Line separator, paragraph separator.
			} else if ( (cp == UCP(0x2015)) ||
						((UCP(0x2018) <= cp) && (cp <= UCP(0x201F))) ||
						(cp == UCP(0x2039)) || (cp == UCP(0x203A)) ) {
				kind = UCK_quote;		// Dash quote, U+2018..U+201F various quotes, single guillemets.
			}
			break;

		case 0x06 :		// U+06xx
			if ( cp == UCP(0x060C) ) {
				kind = UCK_comma;		// Arabic comma.
			} else if ( cp == UCP(0x061B) ) {
				kind = UCK_semicolon;	// Arabic semicolon.
			}
			break;

		case 0x05 :		// U+05xx
			if ( cp == UCP(0x055D) ) kind = UCK_comma;		// Armenian comma.
			break;

		case 0x03 :		// U+03xx
			if ( cp == UCP(0x037E) ) kind = UCK_semicolon;	// Greek "semicolon" (really a question mark).
			break;

		case 0x00 :		// U+00xx
			if ( (cp == UCP(0x00AB)) || (cp == UCP(0x00BB)) ) kind = UCK_quote;	// Guillemet quotes.
			break;

		default :
			break;

	}

	*charKind = kind;

}	// ClassifyCharacter
// -------------------------------------------------------------------------------------------------
// GetClosingQuote
//
// Map an opening quote code point to the code point that closes it. Returns 0 when openQuote is
// not one of the recognized opening quotes.

static UniCodePoint
GetClosingQuote ( UniCodePoint openQuote )
{

	switch ( openQuote ) {

		// Characters that serve as both the opening and the closing mark.
		case UCP(0x0022) : return UCP(0x0022);
		case UCP(0x2015) : return UCP(0x2015);

		// Reversible pairs, either member may open.
		case UCP(0x00AB) : return UCP(0x00BB);
		case UCP(0x00BB) : return UCP(0x00AB);
		case UCP(0x2039) : return UCP(0x203A);
		case UCP(0x203A) : return UCP(0x2039);

		// One-directional pairs.
		case UCP(0x005B) : return UCP(0x005D);
		case UCP(0x2018) : return UCP(0x2019);
		case UCP(0x201A) : return UCP(0x201B);
		case UCP(0x201C) : return UCP(0x201D);
		case UCP(0x201E) : return UCP(0x201F);
		case UCP(0x3008) : return UCP(0x3009);
		case UCP(0x300A) : return UCP(0x300B);
		case UCP(0x300C) : return UCP(0x300D);
		case UCP(0x300E) : return UCP(0x300F);
		case UCP(0x301D) : return UCP(0x301F);	// ! U+301E also closes U+301D.

		default : return 0;

	}

}	// GetClosingQuote
示例#8
0
文件: ESCAPE.c 项目: PkmX/bsdconv
/*
 * Escape callback: renders the current datum as text, wrapped in the
 * codec's configured prefix and suffix, and appends the result to the
 * phase's output list as a heap buffer flagged F_FREE.
 *
 * Accepted inputs:
 *   - filter 1, datum longer than one byte and tagged 1 ("unicode"):
 *       mode 16: two upper-case hex digits per payload byte;
 *       mode 10: the payload bytes folded most significant first into an
 *                unsigned int and printed in decimal.
 *   - filter 3, two-byte datum tagged 3 ("byte"):
 *       mode 8:  three octal digits;
 *       mode 10: decimal;
 *       mode 16: two upper-case hex digits.
 *
 * Any other combination sets the phase status to DEADEND. On success the
 * status is NEXTPHASE. For the decimal modes the buffer is sized for the
 * widest possible number and len is corrected afterwards.
 */
void cbconv(struct bsdconv_instance *ins){
	struct bsdconv_phase *this_phase=THIS_PHASE(ins);
	struct my_s *t=THIS_CODEC(ins)->priv;
	int idx;
	unsigned int value;
	char *out;

	if(t->filter==1 && this_phase->curr->len>1 && UCP(this_phase->curr->data)[0]==1){
		/* unicode */
		switch(t->mode){
		case 16: {
			DATA_MALLOC(ins, this_phase->data_tail->next);
			this_phase->data_tail=this_phase->data_tail->next;
			this_phase->data_tail->next=NULL;
			this_phase->data_tail->flags=F_FREE;
			this_phase->data_tail->len=(this_phase->curr->len-1)*2+t->prefix.len + t->suffix.len;
			this_phase->data_tail->data=malloc(this_phase->data_tail->len+1);
			memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len);
			out=this_phase->data_tail->data+t->prefix.len;

			idx=1;
			while(idx<this_phase->curr->len){
				out+=sprintf(out,"%02X", UCP(this_phase->curr->data)[idx]);
				++idx;
			}
			memcpy(out, t->suffix.data, t->suffix.len);
			ins->phase[ins->phase_index].state.status=NEXTPHASE;
			break;
		}
		case 10: {
			DATA_MALLOC(ins, this_phase->data_tail->next);
			this_phase->data_tail=this_phase->data_tail->next;
			this_phase->data_tail->next=NULL;
			this_phase->data_tail->flags=F_FREE;
			/* Reserve three characters per payload byte; trimmed below. */
			this_phase->data_tail->len=(this_phase->curr->len-1)*3+t->prefix.len + t->suffix.len;
			this_phase->data_tail->data=malloc(this_phase->data_tail->len+1);
			memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len);
			out=this_phase->data_tail->data+t->prefix.len;

			/* Fold the payload bytes into one number, most significant first. */
			value=0;
			for(idx=1;idx<this_phase->curr->len;++idx){
				value=(value<<8)|UCP(this_phase->curr->data)[idx];
			}
			out+=sprintf(out, "%u", value);
			memcpy(out, t->suffix.data, t->suffix.len);
			/* The number may be shorter than reserved: record the real length. */
			this_phase->data_tail->len=(out+t->suffix.len)-CP(this_phase->data_tail->data);
			ins->phase[ins->phase_index].state.status=NEXTPHASE;
			break;
		}
		default:
			ins->phase[ins->phase_index].state.status=DEADEND;
			break;
		}
	}else if(t->filter==3 && this_phase->curr->len==2 && UCP(this_phase->curr->data)[0]==3){
		/* byte */
		switch(t->mode){
		case 8: {
			DATA_MALLOC(ins, this_phase->data_tail->next);
			this_phase->data_tail=this_phase->data_tail->next;
			this_phase->data_tail->next=NULL;
			this_phase->data_tail->flags=F_FREE;
			this_phase->data_tail->len=3+t->prefix.len + t->suffix.len;
			this_phase->data_tail->data=malloc(this_phase->data_tail->len+1);
			memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len);
			out=this_phase->data_tail->data+t->prefix.len;

			/* Three octal digits, most significant first. */
			idx=UCP(this_phase->curr->data)[1];
			*UCP(out)=(idx>>6)+'0';
			*UCP(out+1)=((idx>>3)&7)+'0';
			*UCP(out+2)=(idx&7)+'0';
			memcpy(out+3, t->suffix.data, t->suffix.len);
			ins->phase[ins->phase_index].state.status=NEXTPHASE;
			break;
		}
		case 10: {
			DATA_MALLOC(ins, this_phase->data_tail->next);
			this_phase->data_tail=this_phase->data_tail->next;
			this_phase->data_tail->next=NULL;
			this_phase->data_tail->flags=F_FREE;
			/* Reserve three digits; trimmed below. */
			this_phase->data_tail->len=3+t->prefix.len + t->suffix.len;
			this_phase->data_tail->data=malloc(this_phase->data_tail->len+1);
			memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len);
			out=this_phase->data_tail->data+t->prefix.len;

			out+=sprintf(out, "%d", UCP(this_phase->curr->data)[1]);
			memcpy(out, t->suffix.data, t->suffix.len);
			this_phase->data_tail->len=(out+t->suffix.len)-CP(this_phase->data_tail->data);
			ins->phase[ins->phase_index].state.status=NEXTPHASE;
			break;
		}
		case 16: {
			DATA_MALLOC(ins, this_phase->data_tail->next);
			this_phase->data_tail=this_phase->data_tail->next;
			this_phase->data_tail->next=NULL;
			this_phase->data_tail->flags=F_FREE;
			this_phase->data_tail->len=2+t->prefix.len + t->suffix.len;
			this_phase->data_tail->data=malloc(this_phase->data_tail->len+1);
			memcpy(this_phase->data_tail->data, t->prefix.data, t->prefix.len);
			out=this_phase->data_tail->data+t->prefix.len;

			out+=sprintf(out, "%02X", UCP(this_phase->curr->data)[1]);
			memcpy(out, t->suffix.data, t->suffix.len);
			ins->phase[ins->phase_index].state.status=NEXTPHASE;
			break;
		}
		default:
			ins->phase[ins->phase_index].state.status=DEADEND;
			break;
		}
	}else{
		ins->phase[ins->phase_index].state.status=DEADEND;
	}
}
示例#9
0
文件: _GB18030.c 项目: PkmX/bsdconv
void cbconv(struct bsdconv_instance *ins){
	struct bsdconv_phase *this_phase=THIS_PHASE(ins);
	struct my_s *t=THIS_CODEC(ins)->priv;
	unsigned char d;
	unsigned char *c;
	struct data_st data;
	int max=sizeof(gb18030_table) / sizeof(struct gb18030_data) - 1;
	int min = 0;
	int mid;
	int i;
	union {
		unsigned char byte[4];
		uint32_t ucs4;
	} ucs;

	for(;this_phase->i<this_phase->curr->len;this_phase->i+=1){
		d=UCP(this_phase->curr->data)[this_phase->i];
		memcpy(&data, (char *)(this_phase->codec[this_phase->index].data_z+(uintptr_t)this_phase->state.data), sizeof(struct data_st));
		c=UCP(this_phase->codec[this_phase->index].data_z+(uintptr_t)data.data);
		next:
		switch(t->status){
			case 0:
				if(t->status<data.len){
					t->ucs=c[0]*10;
					t->status=1;
					goto next;
				}
				t->ucs=d*10;
				t->status=1;
				break;
			case 1:
				if(t->status<data.len){
					t->ucs+=c[1];
					t->ucs*=126;
					t->status=2;
					goto next;
				}
				t->ucs+=d;
				t->ucs*=126;
				t->status=2;
				break;
			case 2:
				if(t->status<data.len){
					t->ucs+=c[2];
					t->ucs*=10;
					t->status=3;
					goto next;
				}
				t->ucs+=d;
				t->ucs*=10;
				t->status=3;
				break;
			case 3:
				if(t->status<data.len){
					t->ucs+=c[3];
					t->status=0;
					goto next;
				}
				t->ucs+=d;
				t->status=0;
				if (t->ucs < gb18030_table[0].beg || t->ucs > gb18030_table[max].end){
					DEADEND();
				}else while (max >= min) {
					mid = (min + max) / 2;
					if (t->ucs > gb18030_table[mid].end)
						min = mid + 1;
					else if (t->ucs < gb18030_table[mid].beg)
						max = mid - 1;
					else{
						break;
					}
				}
				if(gb18030_table[mid].beg<=t->ucs && t->ucs<=gb18030_table[mid].end){
					ucs.ucs4=htobe32(gb18030_table[mid].off + (t->ucs - gb18030_table[mid].beg));
					for(i=0;ucs.byte[i]==0 && i<4;++i);
					DATA_MALLOC(ins, this_phase->data_tail->next);
					this_phase->data_tail=this_phase->data_tail->next;
					this_phase->data_tail->next=NULL;
					this_phase->data_tail->len=5 - i;
					this_phase->data_tail->data=c=malloc(5 - i);
					this_phase->data_tail->flags=F_FREE;
					this_phase->state.status=NEXTPHASE;
					*c=0x01;
					c+=1;
					for(;i<4;++i,c+=1){
						*c=ucs.byte[i];
					}
					return;
				}else{
					DEADEND();
				}
				break;
			default:
				DEADEND();
		}
	}
	this_phase->state.status=CONTINUE;
	return;
}