/*
 * DaoLexer_Tokenize(): split the NUL-terminated text "src" into tokens,
 * appending each one to the lexer "self" via DaoLexer_AppendToken().
 *
 * "flags" is a bit mask:
 *   DAO_LEX_ESCAPE  - decode backslash escape sequences inside string
 *                     literals (otherwise they are kept verbatim);
 *   DAO_LEX_COMMENT - emit comment tokens instead of dropping them;
 *   DAO_LEX_SPACE   - emit whitespace tokens instead of dropping them.
 *
 * Tokenization is table driven: "state" walks daoLexTable[state][ch],
 * while "literal" (aliasing token->string) accumulates the text of the
 * token currently being scanned.  "lexenvs" is a stack of lexical
 * environments used to track nesting of #{ ... #} block comments.
 *
 * Returns the last line number reached on success, or 0 on error (the
 * only error path here is a string ending in a bare backslash).
 */
int DaoLexer_Tokenize( DaoLexer *self, const char *src, int flags )
{
	DString *source = DString_New(1);
	DVector *lexenvs = DVector_New( sizeof(int) );
	DaoToken *token = DaoToken_New();
	DString *literal = & token->string;   /* text of the token being built */
	char ch, *ss, hex[11] = "0x00000000"; /* scratch buffer for \x/\u/\U/\ooo escapes */
	int replace = flags & DAO_LEX_ESCAPE;
	int comment = flags & DAO_LEX_COMMENT;
	int space = flags & DAO_LEX_SPACE;
	int srcSize = (int)strlen( src );
	int old=0, state = TOK_START; /* "old" remembers the state before a transition */
	int lexenv = LEX_ENV_NORMAL;
	int unicoded = 0;
	int line = 1;
	int cpos = 0; /* column position of the current character */
	int ret = 1;
	int it = 0;   /* read cursor into src */
	int i, m = 4;

	DString_SetSharing( literal, 0 );

	/* Detect any byte outside 7-bit ASCII: */
	for(it=0; it<srcSize; it++){
		if( (signed char) src[it] < 0 ){
			unicoded = 1;
			break;
		}
	}
	if( unicoded && daoConfig.mbs == 0 ){
		/*
		 * Pre-pass: normalize full-width (DBC) characters and typographic
		 * quotation marks to their ASCII equivalents, skipping over the
		 * contents of quoted strings so only code outside literals is
		 * rewritten.
		 */
		DString *wcs = DString_New(0);
		/* http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html */
		/* Triples of { opening char, matching closing char, replacement }: */
		wchar_t quotes[] =
		{
			0x27 , 0x27 , 0x27, /* single q.m. */
			0x22 , 0x22 , 0x22, /* double q.m. */
			0x27 + 0xfee0 , 0x27 + 0xfee0 , 0x27 , /* single q.m. unicode */
			0x22 + 0xfee0 , 0x22 + 0xfee0 , 0x22 , /* double q.m. unicode */
			0x60 , 0x27 , 0x27, /* grave accent */
			0x2018 , 0x2019 , 0x27 , /* left/right single q.m. */
			0x201C , 0x201D , 0x22 /* left/right double q.m. */
		};
		wchar_t sl = L'\\' + 0xfee0; /* full-width backslash */
		wchar_t stop;
		int i, N = 21; /* number of entries in quotes[] */
		it = 0;
		DString_SetMBS( wcs, src );
		while( it < wcs->size ){ // TODO: handle verbatim string!
			/* At a quote opener: replace it, then skip to the matching
			 * closer, stepping over (full-width or ASCII) escapes. */
			for( i=0; i<N; i+=3 ){
				if( wcs->wcs[it] == quotes[i] ){
					stop = quotes[i+1];
					wcs->wcs[it] = quotes[i+2];
					it ++;
					while( it < wcs->size && wcs->wcs[it] != stop ){
						if( wcs->wcs[it] == sl || wcs->wcs[it] == L'\\' ){
							it ++;
							continue;
						}
						it ++;
					}
					if( it < wcs->size ) wcs->wcs[it] = quotes[i+2];
					break;
				}
			}
			if( it >= wcs->size ) break;
			if( wcs->wcs[it] == 0x3000 ){
				wcs->wcs[it] = 32; /* blank space */
			}else if( wcs->wcs[it] > 0xff00 && wcs->wcs[it] < 0xff5f ){
				wcs->wcs[it] -= 0xfee0; /* DBC to SBC */
			}
			it ++;
		}
		if( wcs->size ){
			/* Re-encode the normalized text and tokenize that instead: */
			DString_SetWCS( source, wcs->wcs );
			src = source->mbs;
			srcSize = source->size;
		}
		DString_Delete( wcs );
	}

	DaoLexer_Reset( self );
	DVector_PushInt( lexenvs, LEX_ENV_NORMAL );
	it = 0;
	token->cpos = 0;
	while( it < srcSize ){
#if 0
		printf( "tok: %i %i %i %c %s\n", srcSize, it, ch, ch, literal->mbs );
#endif
		token->type = state;
		token->name = 0;
		token->line = line;
		ch = src[it];
		cpos += ch == '\t' ? daoConfig.tabspace : 1;
		if( ch == '\n' ) cpos = 0, line ++;
		/* Record the starting column of a new token: */
		if( literal->size == 0 ) token->cpos = cpos;

		if( state == TOK_STRING_MBS || state == TOK_STRING_WCS ){
			/* Inside a single- or double-quoted string literal: */
			if( ch == '\\' ){
				it ++;
				if( replace == 0 ){
					/* Escape decoding disabled: keep the sequence verbatim. */
					DString_AppendChar( literal, ch );
					if( it < srcSize ){
						if( src[it] == '\n' ) cpos = 0, line ++;
						DString_AppendChar( literal, src[it] );
					}
					it ++;
					continue;
				}
				if( it >= srcSize ){
					ret = 0;
					printf( "error: incomplete string at line %i.\n", line );
					break;
				}
				if( src[it] == '\n' ) cpos = 0, line ++;
				switch( src[it] ){
				case '0' : case '1' : case '2' : case '3' :
				case '4' : case '5' : case '6' : case '7' :
					/* \ooo */
					/* Collect up to 3 octal digits into hex[2..] and decode
					 * with base 8 (skipping the "0x" prefix in hex[]). */
					i = 2;
					while( i < 5 && it < srcSize && src[it] >= '0' && src[it] < '8' ){
						hex[i] = src[it++];
						hex[++i] = 0;
					}
					DString_AppendChar( literal, (char) strtol( hex+2, NULL, 8 ) );
					it --; /* the loop tail's "it ++" steps past the last digit */
					break;
				case '8' : case '9' :
					DString_AppendChar( literal, (char) (src[it] - '0') );
					break;
				case 'x' : case 'u' : case 'U' :
					/* Hex escapes reuse hex[] with its "0x" prefix so that
					 * strtol( hex, NULL, 0 ) parses it as hexadecimal. */
					i = 2;
					switch( src[it] ){
					case 'x' : m = 4;  break; /* \xhh: max 2 hex digit; */
					case 'u' : m = 6;  break; /* \uhhhh: max 4 hex digit; */
					case 'U' : m = 10; break; /* \Uhhhhhhhh: max 8 hex digit; */
					}
					while( i < m && (it+1) < srcSize && isxdigit( src[it+1] ) ){
						hex[i] = src[++it];
						hex[++i] = 0;
					}
					DString_AppendWChar( literal, (wchar_t) strtol( hex, NULL, 0 ) );
					break;
				case 't' : DString_AppendChar( literal, '\t' ); break;
				case 'n' : DString_AppendChar( literal, '\n' ); break;
				case 'r' : DString_AppendChar( literal, '\r' ); break;
				case '\'' : DString_AppendChar( literal, '\'' ); break;
				case '\"' : DString_AppendChar( literal, '\"' ); break;
				default : DString_AppendChar( literal, src[it] ); break;
				}
			}else if( ch == '\'' && state == TOK_STRING_MBS ){
				/* Closing quote: emit the completed MBS string token. */
				DString_AppendChar( literal, ch );
				state = TOK_RESTART;
				token->type = token->name = DTOK_MBS;
				DaoLexer_AppendToken( self, token );
				DString_Clear( literal );
			}else if( ch == '\"' && state == TOK_STRING_WCS ){
				/* Closing quote: emit the completed WCS string token. */
				DString_AppendChar( literal, ch );
				state = TOK_RESTART;
				token->type = token->name = DTOK_WCS;
				DaoLexer_AppendToken( self, token );
				DString_Clear( literal );
			}else{
				DString_AppendChar( literal, ch );
			}
		}else if( ch == ']' && state == TOK_VERBATIM ){
			/* End of a verbatim-string opening tag: "literal" holds the tag
			 * (e.g. "@[delim]"); search for the identical closing tag.  If
			 * none is found, the rest of the source becomes a DTOK_VBT_OPEN
			 * (unterminated verbatim) token. */
			int len = srcSize - it - 1;
			DString_AppendChar( literal, ']' );
			token->type = token->name = DTOK_VBT_OPEN;
			if( (ss = strstr( src + it + 1, literal->mbs )) != NULL ){
				len = (ss - src) - it - 1 + literal->size;
				token->type = token->name = DTOK_VERBATIM;
			}
			/* Keep the line counter in sync with the skipped body: */
			for(i=0; i<len; i++) if( src[it+1+i] == '\n' ) line += 1;
			DString_AppendDataMBS( literal, src + it + 1, len );
			state = TOK_RESTART;
			DaoLexer_AppendToken( self, token );
			DString_Clear( literal );
			it += len;
		}else if( lexenv == LEX_ENV_NORMAL ){
			old = state;
			if( ch >=0 ){
				/* ASCII: normal table-driven transition. */
				state = daoLexTable[ state ][ (int)ch ];
			}else if( state <= TOK_START ){
				state = TOK_RESTART;
			}else if( state != TOK_IDENTIFIER && state != TOK_STRING_MBS
					&& state != TOK_STRING_WCS && state != TOK_COMT_LINE
					&& state != TOK_COMT_OPEN ){
				/* Non-ASCII byte ends any token other than identifiers,
				 * strings and comments. */
				state = TOK_RESTART;
			}
			if( state >= TOK_END ){
				/* Reached an accepting state: the character completes a token. */
				DString_AppendChar( literal, ch );
				token->type = token->name = daoTokenMap[ state ];
				if( token->type == DTOK_ID_THTYPE || token->type == DTOK_ID_SYMBOL )
					token->type = DTOK_IDENTIFIER;
				if( space || comment || token->type != DTOK_COMMENT ){
					if( isspace( token->string.mbs[0] ) )
						token->type = token->name = daoSpaceType[ (int)token->string.mbs[0] ];
					DaoLexer_AppendToken( self, token );
				}
				/* may be a token before the line break; */
				DString_Clear( literal );
				state = TOK_START;
			}else if( state == TOK_RESTART ){
				/* The character does not extend the current token: flush
				 * whatever has accumulated, then restart on this character. */
				if( literal->size ){
					if( old == TOK_IDENTIFIER ){
						/* Map keywords to their token ids; 0 means "not a keyword". */
						token->name = dao_key_hash( literal->mbs, literal->size );
						token->type = DTOK_IDENTIFIER;
						if( token->name == 0 ) token->name = DTOK_IDENTIFIER;
						DaoLexer_AppendToken( self, token );
					}else if( old > TOK_RESTART && old != TOK_END ){
						token->type = token->name = daoTokenMap[ old ];
						if( token->type == DTOK_ID_THTYPE || token->type == DTOK_ID_SYMBOL )
							token->type = DTOK_IDENTIFIER;
						DaoLexer_AppendToken( self, token );
					}else if( space ){
						if( isspace( token->string.mbs[0] ) )
							token->type = token->name = daoSpaceType[ (int)token->string.mbs[0] ];
						DaoLexer_AppendToken( self, token );
					}
					DString_Clear( literal );
					token->cpos = cpos;
				}
				DString_AppendChar( literal, ch );
				/* Non-ASCII bytes can only start an identifier: */
				if( ch >=0 )
					state = daoLexTable[ TOK_START ][ (int)ch ];
				else
					state = TOK_IDENTIFIER;
			}else if( state == TOK_COMT_OPEN ){
				/* "#{": enter the block-comment environment. */
				DString_AppendChar( literal, ch );
				lexenv = LEX_ENV_COMMENT;
				DVector_PushInt( lexenvs, LEX_ENV_COMMENT );
			}else{
				/* Intermediate state: keep accumulating. */
				DString_AppendChar( literal, ch );
			}
		}else if( lexenv == LEX_ENV_COMMENT ){
			/* Inside a (possibly nested) #{ ... #} block comment. */
			DString_AppendChar( literal, ch );
			if( ch == '#' ){
				state = TOK_OP_SHARP;
			}else if( ch == '{' && state == TOK_OP_SHARP ){
				/* "#{": one level deeper. */
				state = TOK_COMT_OPEN;
				DVector_PushInt( lexenvs, LEX_ENV_COMMENT );
			}else if( ch == '}' && state == TOK_OP_SHARP ){
				/* "#}": close one level; emit the comment token when the
				 * outermost level is closed. */
				state = TOK_COMT_CLOSE;
				DVector_Pop( lexenvs );
				lexenv = lexenvs->data.ints[lexenvs->size-1];
				if( lexenv != LEX_ENV_COMMENT ){
					token->type = token->name = DTOK_COMMENT;
					if( comment ) DaoLexer_AppendToken( self, token );
					DString_Clear( literal );
					state = TOK_RESTART;
				}
			}else{
				state = TOK_START;
			}
		}
		it ++;
	}

	/* Flush any token still being accumulated at end of input; unclosed
	 * strings/comments are emitted as their respective *_OPEN tokens. */
	if( literal->size ){
		token->type = token->name = daoTokenMap[ state ];
		if( lexenv == LEX_ENV_COMMENT ) token->type = token->name = DTOK_CMT_OPEN;
		switch( state ){
		case TOK_STRING_MBS : token->type = token->name = DTOK_MBS_OPEN; break;
		case TOK_STRING_WCS : token->type = token->name = DTOK_WCS_OPEN; break;
		}
		if( token->type == DTOK_IDENTIFIER ){
			token->name = dao_key_hash( literal->mbs, literal->size );
			if( token->name == 0 ) token->name = DTOK_IDENTIFIER;
		}else if( token->type == DTOK_ID_THTYPE || token->type == DTOK_ID_SYMBOL ){
			token->type = DTOK_IDENTIFIER;
		}
		if( token->type || space ){
			if( isspace( token->string.mbs[0] ) )
				token->type = token->name = daoSpaceType[ (int)token->string.mbs[0] ];
			DaoLexer_AppendToken( self, token );
		}
	}
	DaoToken_Delete( token );
	DVector_Delete( lexenvs );
	DString_Delete( source );
#if 0
	for(i=0; i<self->tokens->size; i++){
		DaoToken *tk = self->tokens->items.pToken[i];
		printf( "%4i: %4i %4i , %4i, %s\n", i, tk->type, tk->name, tk->cpos, tk->string.mbs );
	}
#endif
	return ret ? line : 0;
}
/*
 * DaoParser_MakeMacroGroup(): recursively build a macro pattern group from
 * the parser's tokens in the half-open range [from, to).
 *
 * Grouping/repetition syntax recognized here:
 *   ( ... )   plain group;  [ ... ]  zero-or-one;  { ... }  zero-or-more;
 *   a trailing !, ?, *, or + after a group overrides its repeat mode;
 *   alternatives inside a group are separated by '|' (DMACRO_ALT);
 *   $EXP../$VAR../$ID../$OP../$BL.. markers become typed DMacroUnits;
 *   quoted tokens ('...') are re-tokenized and, for '(' '[' '{', paired
 *   with their quoted closer to form an implicit DMACRO_AUTO group.
 *
 * "vars" counts occurrences of $-markers, "markers" of quoted markers;
 * either map may be NULL to skip the bookkeeping.
 *
 * Results are appended to "group"->units; "parent" becomes the parent of
 * any sub-group created at this level.
 *
 * Returns 1 on success, 0 on error (after DaoParser_Error was called or a
 * recursive call failed).
 */
static int DaoParser_MakeMacroGroup( DaoParser *self, DMacroGroup *group,
		DMacroGroup *parent, int from, int to, DMap *vars, DMap *markers )
{
	unsigned char tk; /* NOTE(review): shadowed by the int tk below, never used */
	int i, sep, rb, prev;
	DaoToken **toks = self->tokens->items.pToken;
	DMacroGroup *grp, *group2; /* mingw don't like grp2 ?! */
	DMacroUnit *unit;
	DNode *it;
	/* for( i=from; i<to; i++ ) printf( "%s ", toks[i]->mbs ); printf("\n"); */
	i = from;
	while( i < to ){
		DaoToken *tok = toks[i];
		char *chs = tok->string.mbs;
		int tk = tok->name;
#if 0
		printf( "%i %s\n", i, chs );
#endif
		self->curLine = tok->line;
		if( tk == DTOK_LB || tk == DTOK_LSB || tk == DTOK_LCB ){
			/* Opening bracket: create a sub-group covering its contents. */
			grp = DMacroGroup_New();
			grp->cpos = tok->cpos;
			grp->parent = parent;
			DArray_Append( group->units, (void*)grp );
			/* Find the matching closer; [ and { imply a repeat mode. */
			switch( tk ){
			case DTOK_LB :
				rb = DaoParser_FindPairToken( self, DTOK_LB, DTOK_RB, i, to );
				break;
			case DTOK_LSB :
				rb = DaoParser_FindPairToken( self, DTOK_LSB, DTOK_RSB, i, to );
				grp->repeat = DMACRO_ZERO_OR_ONE;
				break;
			case DTOK_LCB :
				rb = DaoParser_FindPairToken( self, DTOK_LCB, DTOK_RCB, i, to );
				grp->repeat = DMACRO_ZERO_OR_MORE;
				break;
			default :
				rb = -1;
				DaoParser_Error( self, DAO_CTW_INV_MAC_OPEN, & tok->string );
				break;
			}
			if( rb <0 ) return 0;
			prev = i+1;
			/* '|' at this nesting level splits the group into alternatives. */
			sep = DaoParser_FindOpenToken( self, DTOK_PIPE, i+1, rb, 0 );
			if( sep >=0 ){
				while( sep >=0 ){
					/* One sub-group per alternative segment [prev, sep): */
					group2 = DMacroGroup_New();
					group2->parent = grp;
					if( DaoParser_MakeMacroGroup( self, group2, group2, prev, sep, vars, markers ) == 0 )
						return 0;
					DArray_Append( grp->units, (void*)group2 );
					prev = sep +1;
					sep = DaoParser_FindOpenToken( self, DTOK_PIPE, prev, rb, 0 );
					/* No further '|': the final segment runs up to the closer. */
					if( prev < rb && sep <0 ) sep = rb;
				}
				grp->type = DMACRO_ALT;
			}else if( DaoParser_MakeMacroGroup( self, grp, grp, i+1, rb, vars, markers ) == 0 ){
				return 0;
			}
			i = rb +1;
			/* NOTE(review): reads toks[rb+1]; assumes a token always follows
			 * the closing bracket within the stream — confirm at call sites. */
			tok = toks[i];
			self->curLine = tok->line;
			/* Optional repetition suffix after the group: */
			switch( tok->name ){
			case DTOK_NOT :   grp->repeat = DMACRO_ZERO;         i++; break;
			case DTOK_QUERY : grp->repeat = DMACRO_ZERO_OR_ONE;  i++; break;
			case DTOK_MUL :   grp->repeat = DMACRO_ZERO_OR_MORE; i++; break;
			case DTOK_ADD :   grp->repeat = DMACRO_ONE_OR_MORE;  i++; break;
			default : break;
			}
			continue;
		}
		self->curLine = tok->line;
		/* Plain token: wrap it in a DMacroUnit. */
		unit = DMacroUnit_New();
		DaoToken_Assign( unit->marker, tok );
		DArray_Append( group->units, (void*)unit );
		if( chs[0] == '$' ){
			/* Special marker: classify by the tag right after the '$'
			 * (FindMBS()==1 means the tag starts at string index 1). */
			if( DString_FindMBS( & tok->string, "EXP", 0 ) == 1 ){
				unit->type = DMACRO_EXP;
			}else if( DString_FindMBS( & tok->string, "VAR", 0 ) == 1 ){
				unit->type = DMACRO_VAR;
			}else if( DString_FindMBS( & tok->string, "ID", 0 ) == 1 ){
				unit->type = DMACRO_ID;
			}else if( DString_FindMBS( & tok->string, "OP", 0 ) == 1 ){
				unit->type = DMACRO_OP;
			}else if( DString_FindMBS( & tok->string, "BL", 0 ) == 1 ){
				unit->type = DMACRO_BL;
			}else{
				DaoParser_Error( self, DAO_CTW_INV_MAC_SPECTOK, & tok->string );
				return 0;
			}
			/* Count each distinct $-marker occurrence: */
			if( vars != NULL ){
				it = DMap_Find( vars, & unit->marker->string );
				if( it == NULL ) it = DMap_Insert( vars, & unit->marker->string, 0 );
				it->value.pInt += 1;
			}
		}else if( tk == DTOK_MBS ){
			/* Quoted token, e.g. '+': re-tokenize the text between the
			 * quotes; exactly one token (plus terminator, size == 2) means
			 * it is a single concrete marker. */
			DaoLexer_Reset( self->wlexer );
			DaoLexer_Tokenize( self->wlexer, chs + 1, 0 );
			if( self->wlexer->tokens->size == 2 ){
				DaoToken_Assign( unit->marker, self->wlexer->tokens->items.pToken[0] );
				if( markers != NULL ){
					it = DMap_Find( markers, & unit->marker->string );
					if( it == NULL ) it = DMap_Insert( markers, & unit->marker->string, 0 );
					it->value.pInt += 1;
				}
			}
			DaoLexer_Reset( self->wlexer );
			rb = -1;
			/* A quoted single bracket ("'('" etc., string size 3) opens an
			 * implicit group up to the matching quoted closer: */
			if( tok->string.size == 3 ){
				switch( chs[1] ){
				case '(': rb = DaoParser_FindPair( self, "'('", "')'", i, to ); break;
				case '[': rb = DaoParser_FindPair( self, "'['", "']'", i, to ); break;
				case '{': rb = DaoParser_FindPair( self, "'{'", "'}'", i, to ); break;
				default : break;
				}
			}
			if( rb >= 0 ){
				grp = DMacroGroup_New();
				grp->parent = group;
				grp->repeat = DMACRO_AUTO;
				DArray_Append( group->units, (void*)grp );
				if( DaoParser_MakeMacroGroup( self, grp, parent, i+1, rb, vars, markers ) == 0 )
					return 0;
				i = rb; /* resume at the quoted closer itself */
				continue;
			}
		}
		i ++;
	}
	return 1;
}