/* Note: do not free phrase, as it is used by strings (array of segment). */ bool segment(FacadePhraseTable3 * phrase_table, FacadePhraseIndex * phrase_index, GArray * current_ucs4, GArray * strings /* Array of SegmentStep. */){ ucs4_t * phrase = (ucs4_t *)current_ucs4->data; guint phrase_len = current_ucs4->len; /* Prepare for shortest path segment dynamic programming. */ GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); SegmentStep step; for ( glong i = 0; i < phrase_len + 1; ++i ){ g_array_append_val(steps, step); } SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0); first_step->m_nword = 0; PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index->prepare_tokens(tokens); for ( glong i = 0; i < phrase_len + 1; ++i ) { SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); size_t nword = step_begin->m_nword; for ( glong k = i + 1; k < phrase_len + 1; ++k ) { size_t len = k - i; ucs4_t * cur_phrase = phrase + i; phrase_token_t token = null_token; phrase_index->clear_tokens(tokens); int result = phrase_table->search(len, cur_phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ){ token = null_token; if ( 1 != len ) continue; } ++nword; SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); if ( nword < step_end->m_nword ) { step_end->m_handle = token; step_end->m_phrase = cur_phrase; step_end->m_phrase_len = len; step_end->m_nword = nword; step_end->m_backward_nstep = i - k; } if ( !(result & SEARCH_CONTINUED) ) break; } } phrase_index->destroy_tokens(tokens); return backtrace(steps, phrase_len, strings); }
static int masm_main_loop(char * obj_file,char * src_file) { FILE * obj_fp, *src_fp; char buf[BUFSIZ] = {0}; int length; char * p, * q; src_fp = obj_fp = NULL; uint32_t counter = 0; uint32_t lines = 0; hash_table * label_hash = hash_create(512); char op_name[128]; char label[128]; char rd[6]; char rs[6]; char rt[6]; char imm[20]; int32_t rd_num,rs_num,rt_num,imm_num; if((src_fp = fopen(src_file,"r")) == NULL) { printf("Can not open %s:%s\n",src_file,strerror(errno)); } if((obj_fp = fopen(obj_file,"w+")) == NULL) { printf("Can not open %s:%s\n",obj_file,strerror(errno)); } int total_lines = get_file_lines(src_fp); uint32_t * instruction = calloc(1,total_lines * sizeof(uint32_t)); var_t * var = calloc(1,total_lines * sizeof(var_t)); int var_count = 0; fseek(src_fp,0L,SEEK_SET); while(1) { fgets(buf,BUFSIZ,src_fp); if(feof(src_fp)) { break; } lines++; length = strlen(buf); p = buf; //skip whitespace while(length > 0 &&isspace(p[0])) { length--; p++; } //printf("length=%d\t%s",length,buf+i); if(p[0] == ';' || p[0] == '\0') { continue; } q = get_first_token(p); strncpy(op_name, p , q-p); op_name[q-p] = '\0'; if(line_has_label(p)) { /* it is label */ label_t l; l.name = op_name; l.real_line = lines; l.line = counter; hash_add_item(&label_hash,str2hash(op_name),&l); p = skip_label_wthie(q); /* 获得字符串 */ q = get_opcode_token(p); strncpy(op_name, p , q-p); op_name[q-p] = '\0'; //printf("%s",op_name); } /* p now a opcode start q-p is opecode */ int op_index = verify_opcode(op_name,lines); q = skip_wthie(q); p = q; /* now at rd */ switch(op_index) { #if 1 case ADD: case SUB: case MUL: case DIV: case MOD: case AND: case OR: case NOT: case XOR: case LWORD: case SWORD: /* 获得字符串 */ q = get_reg_token(p); strncpy(rd, p , q-p); rd[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = get_reg_token(p); strncpy(rs, p , q-p); rs[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = get_reg_token(p); strncpy(rt, p , q-p); rt[q-p] = '\0'; rd_num = get_reg_index(rd,lines); rs_num = 
get_reg_index(rs,lines); rt_num = get_reg_index(rt,lines); instruction[counter] = (op_index << 26) | (rd_num << 21) | (rs_num << 16)| (rt_num << 11); break; ///C语言中的左移就是逻辑左移,而右移, ///对于无符号数来说就是逻辑右移, ///对有符号来说就是算术右移 ///想要实现符号左移,比较麻烦一点,首先保存最高位,然后左移之后补上最高位。 case SLL: case SLR: case SAL: case SAR: case ADDI: case ANDI: ///这里的立即数是0扩展的。 case ORI: case XORI: case LUI: ///哦,载入高16位数啊,靠,那么低位怎么载入呢?用ori q = get_reg_token(p); strncpy(rd, p , q-p); rd[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = get_reg_token(p); strncpy(rs, p , q-p); rs[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = get_reg_token(p); strncpy(imm, p , q-p); imm[q-p] = '\0'; rd_num = get_reg_index(rd,lines); rs_num = get_reg_index(rs,lines); imm_num = atoi(imm); if(imm_num > 32767 || imm_num < -32768) { printf("________\n"); printf("[ERROR 6] line: %d imm num is too lager or too smaller\n",lines); } instruction[counter] = (op_index << 26) | (rd_num << 21) | (rs_num << 16)| (imm_num & 0x0000ffff); break; case LESS: case GREAT: case LESSE: case GREATE: case LESSU: case GREATU: case LESSEU: case GREATEU: case EQUAL: case UEQUAL: q = get_reg_token(p); strncpy(rd, p , q-p); rd[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = get_reg_token(p); strncpy(rs, p , q-p); rs[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = get_reg_token(p); strncpy(label, p , q-p); label[q-p] = '\0'; rd_num = get_reg_index(rd,lines); rs_num = get_reg_index(rs,lines); var[var_count].name = malloc(strlen(label) + 1); strcpy(var[var_count].name, label); var[var_count].line = counter; var_count++; instruction[counter] = (op_index << 26) | (rd_num << 21) | (rs_num << 16) | 0x0; break; case JMP: q = get_reg_token(p); strncpy(label, p , q-p); label[q-p] = '\0'; var[var_count].name = malloc(strlen(label) + 1); strcpy(var[var_count].name, label); var[var_count].line = counter; var_count++; instruction[counter] = (op_index << 26); break; /* 存储指令 */ case MOV: q = get_reg_token(p); strncpy(rd, p , q-p); rd[q-p] = '\0'; q = skip_reg_wthie(q); p = q; q = 
get_reg_token(p); strncpy(rs, p , q-p); rs[q-p] = '\0'; rd_num = get_reg_index(rd,lines); rs_num = get_reg_index(rs,lines); instruction[counter] = (op_index << 26) | (rd_num << 21) | (rs_num << 16) | 0x0; break; default: break; #endif } counter++; } /* 第二趟汇编 */ struct blist * head; for(int i = 0; i < var_count; i++) { if((head = hash_lookup_item(label_hash,str2hash(var[i].name),&var[i])) != NULL) { label_t * node = head->item; int imm_2 = node->line - var[i].line; if((instruction[var[i].line] >> 26) == JMP) { if(imm_2 > 33554431 || imm_2 < -33554432) { printf("[ERROR 7] line: %d imm num is too lager or too smaller\n",lines); } instruction[var[i].line] |= imm_2 & 0x03ffffff; } else { if(imm_2 > 32767 || imm_2 < -32768) { printf("[ERROR 6] line: %d imm num is too lager or too smaller\n",lines); } instruction[var[i].line] |= imm_2 & 0x0000ffff; } }
/*
 * Parse one expression with the precedence table and a token stack.
 *
 * res        - parser resources (source, string/struct buffers, instruction
 *              buffer); must not be NULL.
 * last_token - in: if *last_token is non-NULL the first input token is taken
 *              from *last_index instead of being read from the source.
 *              out: set to the token that terminated the expression; its
 *              original_type receives the type of the expression.
 * last_index - index of the already-read token, used only when *last_token
 *              is non-NULL on entry.
 *
 * Returns RETURN_OK on success, otherwise the error code stored in iRet
 * (LEXICAL_ERROR, SYNTAX_ERROR, INTERNAL_ERROR or the code returned by a
 * helper).
 *
 * NOTE(review): the catch_*_error macros are defined elsewhere; they are
 * presumably responsible for setting iRet and leaving through EXIT - confirm.
 */
int check_expression(Resources *res, TToken **last_token, index_t *last_index)
{
    args_assert(res != NULL, INTERNAL_ERROR);

    TToken *input_token = NULL;
    TToken *top_token = NULL;
    TToken *tmp = NULL;
    index_t top_index = ZERO_INDEX;
    index_t input_index = ZERO_INDEX;
    TStack stack;
    int iRet = RETURN_OK;
    int return_type;

    init_stack(&stack);

    new_item(&res->struct_buff, top_index, top_token);
    top_token->token_type = END_OF_EXPR;
    push(&res->struct_buff, &stack, top_index); // $ on top of the stack

    // Either reuse the token the caller has already read or fetch a new one.
    if ((*last_token) != NULL) {
        input_index = *last_index;
    } else {
        input_index = get_token(res->source, &res->string_buff, &res->struct_buff);
    }

    catch_internal_error(
        dereference_structure(&res->struct_buff, input_index, (void **)&input_token),
        INTERNAL_ERROR,
        "Failed to dereference structure buffer."
    );

    if (input_token->token_type == ERRORT) {
        iRet = LEXICAL_ERROR;
        goto EXIT;
    }

    // top_token is looked up again after the buffer has been used above.
    catch_internal_error(
        dereference_structure(&res->struct_buff, top_index, (void **)&top_token),
        INTERNAL_ERROR,
        "Failed to dereference structure buffer."
    );

    do {
#if DEBUG
        print_stack(&res->struct_buff, &stack);
#endif

        debug_print("%s %d\n", "TOP", top_token->token_type);
        debug_print("%s %d\n", "INPUT", input_token->token_type);

        if (top_token->token_type == IDENTIFIER
            && input_token->token_type == OPENING_BRACKET) {

            debug_print("%s\n", "FUNCTION CALL IN EXPR");

            // Remember the identifier now; top_token is not trusted again
            // until it has been dereferenced anew further below.
            index_t last_id = top_token->token_index;
            catch_undefined_error(is_func_declared(res, last_id),
                                  SEMANTIC_ERROR,
                                  "Function declaration check failed.",
                                  1
            );

            // Checked like every other dereference in this function; the
            // result used to be ignored here.
            catch_internal_error(
                dereference_structure(&res->struct_buff, input_index, (void **)last_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );

            if ((iRet = generate_function_call(res, last_id)) != 0) {
                goto EXIT;
            }

            // Use the saved index instead of reading through top_token:
            // generate_function_call() may have touched the structure buffer
            // (NOTE(review): assumed from the re-dereference pattern used
            // throughout this function - confirm).
            return_type = get_return_type(res, last_id);
            catch_internal_error(return_type, SYNTAX_ERROR, "Failed to get function return type.");

            // Reduction of function call
            if ((iRet = reduce(&res->struct_buff, &stack, return_type)) != RETURN_OK) {
                goto EXIT;
            }

            top_index = stack.top;
            catch_syntax_error(
                get_first_token(&res->struct_buff, &stack, &top_index),
                INTERNAL_ERROR,
                "Failed to get first token", 1
            );

            input_index = get_token(res->source, &res->string_buff, &res->struct_buff);
            catch_internal_error(
                dereference_structure(&res->struct_buff, input_index, (void **)&input_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );

            if (input_token->token_type == ERRORT) {
                iRet = LEXICAL_ERROR;
                goto EXIT;
            }

            catch_internal_error(
                dereference_structure(&res->struct_buff, top_index, (void **)&top_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );

            // The call was the whole expression.
            if (type_filter(top_token->token_type) == END_OF_EXPR
                && type_filter(input_token->token_type) == END_OF_EXPR) {
                break;
            }
        }

        switch (precedence_table[type_filter(top_token->token_type)]
                                [type_filter(input_token->token_type)]) {
        case H:
            debug_print("%s\n", "CASE H");

            // Push the input token and read the next one.
            top_index = input_index;
            push(&res->struct_buff, &stack, top_index);

            input_index = get_token(res->source, &res->string_buff, &res->struct_buff);
            catch_internal_error(
                dereference_structure(&res->struct_buff, input_index, (void **)&input_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );

            if (input_token->token_type == ERRORT) {
                iRet = LEXICAL_ERROR;
                goto EXIT;
            }

            catch_internal_error(
                dereference_structure(&res->struct_buff, top_index, (void **)&top_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );
            break;

        case S:
            debug_print("%s\n", "CASE S");

            // Create the SHIFT marker item.
            new_item(&res->struct_buff, top_index, top_token);
            catch_internal_error(
                dereference_structure(&res->struct_buff, stack.top, (void **)&tmp),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );
            top_token->token_type = SHIFT;

            // An RVALUE on top stays on top: the marker goes underneath it.
            if (tmp->token_type == RVALUE) {
                index_t rvalue_index = stack.top;
                pop(&res->struct_buff, &stack);
                push(&res->struct_buff, &stack, top_index);
                push(&res->struct_buff, &stack, rvalue_index);
            } else {
                push(&res->struct_buff, &stack, top_index);
            }

            catch_internal_error(
                dereference_structure(&res->struct_buff, input_index, (void **)&input_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );

            // Push the input token and read the next one.
            top_index = input_index;
            push(&res->struct_buff, &stack, top_index);

            input_index = get_token(res->source, &res->string_buff, &res->struct_buff);
            catch_internal_error(
                dereference_structure(&res->struct_buff, input_index, (void **)&input_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );

            if (input_token->token_type == ERRORT) {
                iRet = LEXICAL_ERROR;
                goto EXIT;
            }

            catch_internal_error(
                dereference_structure(&res->struct_buff, top_index, (void **)&top_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );
            break;

        case R:
            debug_print("%s\n", "CASE R");

            if ((iRet = get_rule(res, &stack)) != RETURN_OK) {
                goto EXIT;
            }

            top_index = stack.top;
            catch_syntax_error(
                get_first_token(&res->struct_buff, &stack, &top_index),
                INTERNAL_ERROR,
                "Failed to get first token", 1
            );

            catch_internal_error(
                dereference_structure(&res->struct_buff, top_index, (void **)&top_token),
                INTERNAL_ERROR,
                "Failed to dereference structure buffer."
            );
            break;

        case E:
            debug_print("%s\n", "CASE E");

            // A closing bracket with only $ left ends the expression; any
            // other combination is a syntax error.
            if (type_filter(top_token->token_type) == END_OF_EXPR
                && type_filter(input_token->token_type) == CLOSING_BRACKET) {

                catch_internal_error(
                    dereference_structure(&res->struct_buff, input_index, (void **)last_token),
                    INTERNAL_ERROR,
                    "Failed to dereference structure buffer."
                );

                catch_internal_error(
                    dereference_structure(&res->struct_buff, stack.top, (void **)&top_token),
                    INTERNAL_ERROR,
                    "Failed to dereference structure buffer."
                );

                if (top_token->original_type == 0) {
                    // Empty expression, there was nothing reduced on top
                    debug_print("%s: %d\n", "EMPTY EXPRESSION RETURN", SYNTAX_ERROR);
                    iRet = SYNTAX_ERROR;
                    goto EXIT;
                }

                goto FINISH;
            }

            iRet = SYNTAX_ERROR;
            goto EXIT;

        default:
            debug_print("%s", "DEFAULT\n");
            iRet = INTERNAL_ERROR;
            goto EXIT;
        }

    } while (type_filter(top_token->token_type) != END_OF_EXPR
             || type_filter(input_token->token_type) != END_OF_EXPR);

    // Report the terminating token to the caller and fetch the stack top,
    // which carries the type of the expression.
    catch_internal_error(
        dereference_structure(&res->struct_buff, input_index, (void **)last_token),
        INTERNAL_ERROR,
        "Failed to dereference structure buffer."
    );

    catch_internal_error(
        dereference_structure(&res->struct_buff, stack.top, (void **)&top_token),
        INTERNAL_ERROR,
        "Failed to dereference structure buffer."
    );

FINISH:
    debug_print("%s: %d\n", "TYPE OF EXPRESSION", top_token->original_type);

    // send type of expression back to syntax_analysis
    (*last_token)->original_type = top_token->original_type;

    // set type of stack top on runtime stack
    catch_internal_error(
        new_instruction_int_int(&res->instruction_buffer, 0lu, top_token->original_type, 0, SET_TYPE),
        INTERNAL_ERROR,
        "Failed to generate new instruction"
    );

EXIT:
    debug_print("%s: %d\n", "RETURN", iRet);
    return iRet;
}
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = null_token; if ( 0 != phrase_len ) { phrase_index.clear_tokens(tokens); int result = phrase_table.search(phrase_len, phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. 
*/ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } phrase_index.destroy_tokens(tokens); free(linebuf); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ const char * evals_text = "evals.text"; pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; MemoryChunk * chunk = new MemoryChunk; chunk->load("pinyin_index.bin"); largetable.load(options, chunk, NULL); FacadePhraseTable2 phrase_table; chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); Bigram user_bigram; user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); PinyinLookup2 pinyin_lookup(options, &largetable, &phrase_index, &system_bigram, &user_bigram); /* open evals.text. */ FILE * evals_file = fopen(evals_text, "r"); if ( NULL == evals_file ) { fprintf(stderr, "Can't open file:%s\n", evals_text); exit(ENOENT); } PhraseTokens phrase_tokens; memset(phrase_tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(phrase_tokens); /* Evaluates the correction rate of test text documents. */ size_t tested_count = 0; size_t passed_count = 0; char* linebuf = NULL; size_t size = 0; TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t)); phrase_token_t token = null_token; while( getline(&linebuf, &size, evals_file) ) { if ( feof(evals_file) ) break; if ( '\n' == linebuf[strlen(linebuf)-1] ) linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); token = null_token; if ( 0 != phrase_len ) { int result = phrase_table.search(phrase_len, phrase, phrase_tokens); int num = get_first_token(phrase_tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } if ( null_token == token ) { if ( tokens->len ) { /* one test. 
*/ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { tested_count ++; passed_count ++; } else { tested_count ++; } g_array_set_size(tokens, 0); } } else { g_array_append_val(tokens, token); } } if ( tokens->len ) { /* one test. */ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { tested_count ++; passed_count ++; } else { tested_count ++; } } parameter_t rate = passed_count / (parameter_t) tested_count; printf("correction rate:%f\n", rate); g_array_free(tokens, TRUE); fclose(evals_file); free(linebuf); phrase_index.destroy_tokens(phrase_tokens); return 0; }