static tdb_error find_duplicate_fieldnames(const char **ofield_names, uint64_t num_ofields) { Pvoid_t check = NULL; tdb_field i; Word_t tmp; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" for (i = 0; i < num_ofields; i++){ Word_t *ptr; JSLI(ptr, check, (const uint8_t*)ofield_names[i]); if (*ptr){ JSLFA(tmp, check); return TDB_ERR_DUPLICATE_FIELDS; } *ptr = 1; } JSLFA(tmp, check); #pragma GCC diagnostic pop return 0; out_of_memory: return TDB_ERR_NOMEM; }
void create_index(char *filepath) { PWord_t PValue; // Judy array element. Word_t Bytes; // size of JudySL array. char buffer[MAXLINE]; // string to insert FILE *fp; fp = fopen(filepath, "r"); if (fp == NULL) { perror ("db open failure"); return; } while(fgets(buffer, sizeof(buffer), fp) != NULL) { remove_eol(buffer); JSLI(PValue, PJArray, buffer); // store string into array if (PValue == PJERR) // if out of memory? { // so do something printf("Malloc failed -- get more ram\n"); exit(1); } ++(*PValue); // count instances of string } fclose(fp); }
Pvoid_t read_parameters() { Pvoid_t params = NULL; unsigned int len, bytes = 0; unsigned char tmp; if (!fscanf(stdin, "%u", &len)) die("Couldn't parse parameter set size"); /* Read a newline after the size spec. Earlier I did * fscanf(stdin, "%u\n", &len) * on the line above, but this was a stupid idea. Fscanf interpretes * *any* whitespace character as a sign to read *any number* of * following whitespace characters, which obviously caused great havoc * here. * */ fread(&tmp, 1, 1, stdin); while (bytes < len){ p_entry *key = read_netstr_entry(&bytes); p_entry *val = read_netstr_entry(&bytes); Word_t *ptr; JSLI(ptr, params, (unsigned char*)key->data); *ptr = (Word_t)val; free(key); } if (bytes > len) die("Invalid parameter set size"); return params; }
/*
 * Associate `data` with the string `key` in the JudySL array table->t,
 * replacing whatever value was stored under that key before.
 *
 * Returns `data`.
 */
void *jtableS_insert(jtableS *table, const char *key, void *data)
{
    const uint8_t *judy_key = (const uint8_t*)key;
    PWord_t slot;

    JSLI(slot, table->t, judy_key);
    *slot = (Word_t)data;   /* the pointer is stored as a machine word */

    return data;
}
/* Insert a non-null-terminated string and a value the size of a machine word. Return 0 on success. */ int insert(char *string, int len, int value) { Word_t *pvalue; TO_INDEX(string, len); JSLI(pvalue, trie, Index); if (pvalue == NULL) return -1; *pvalue = (Word_t)value; return 0; }
int assoc_insert(char *key, void *value) { Word_t *PValue; JSLI( PValue, PJSLArray, key); if (PValue) { *PValue = (Word_t) value; return 1; } else return 0; }
/*
 * Count one occurrence of `token` in the feature set.
 *
 * The JudySL key is the token prefixed with "t:"; the value under that key
 * is incremented. The temporary key buffer is freed before returning.
 *
 * token     NUL-terminated token text
 * features  JudySL array of feature counters (may be NULL / empty)
 *
 * Returns the (possibly updated) array handle. If the key buffer cannot
 * be allocated the array is returned unchanged.
 */
static Pvoid_t add_token(char *token, Pvoid_t features)
{
    static const char prefix[] = "t:";
    const size_t prefix_len = sizeof(prefix) - 1;
    size_t toklen = strlen(token);
    char *feature = malloc(prefix_len + toklen + 1);
    Word_t *PValue;

    if (feature == NULL)
        return features;

    /* exact-size copies: prefix first, then the token with its NUL */
    memcpy(feature, prefix, prefix_len);
    memcpy(feature + prefix_len, token, toklen + 1);

    JSLI(PValue, features, (unsigned char*) feature);
    (*PValue)++;

    free(feature);
    return features;
}
/*
 * Copy *src into the judys_llref object that *dest already points to,
 * using plain assignment of judys_llref.
 *
 * The pointer *dest itself is not modified, so it must reference a valid
 * object on entry. An element-by-element duplication of the underlying
 * JudySL array used to sit here inside an "#if 0" block; it was never
 * compiled and has been dropped.
 *
 * Always returns 0.
 */
int copy_judySL_typed(judys_llref* src, judys_llref** dest)
{
    **dest = *src;
    return 0;
}
/*
 * Build a JudySL array from the loaded ixicon tables.
 *
 * For each of the ixi_cnt entries the key is the string that starts at
 * ixi_body[ixi_idx[n]], and the value is the u32 found in the four bytes
 * immediately before the start of the next entry (ixi_idx[n + 1] - 4).
 *
 * Returns the new array (NULL when ixi_cnt is 0).
 */
Pvoid_t judify_ixicon()
{
    Pvoid_t ixicon = NULL;
    uint pos = 0;

    while (pos < ixi_cnt){
        Word_t *slot = NULL;

        JSLI(slot, ixicon, &ixi_body[ixi_idx[pos]]);
        *slot = *(u32*)&ixi_body[ixi_idx[pos + 1] - 4];
        ++pos;
    }

    return ixicon;
}
/*
 * Copy every ixicon entry whose key contains a '.' into the JudySL array
 * `judy`, keeping the ID stored in the ixicon (the u32 located four bytes
 * before the start of the following entry).
 *
 * The Judy value slot is a full Word_t, so it is written through a
 * Word_t pointer; writing it through a u32 pointer would update only
 * part of the word.
 *
 * NOTE(review): `judy` is received by value, so if an insert changes the
 * array's root pointer the caller's handle is not updated. Fixing that
 * needs a signature change -- confirm how callers rely on this.
 */
void copy_sites(Pvoid_t judy)
{
    uint i;

    for (i = 0; i < ixi_cnt; i++){
        Word_t *xid = NULL;

        if (!index(&ixi_body[ixi_idx[i]], '.'))
            continue;

        JSLI(xid, judy, &ixi_body[ixi_idx[i]]);
        *xid = *(u32*)&ixi_body[ixi_idx[i + 1] - 4];
    }
}
/*
 * Count one occurrence of a URL component in the feature set.
 *
 * The JudySL key is `uri` prefixed with "URLSeg:"; the value under that
 * key is incremented. A NULL `uri` is ignored.
 *
 * uri       NUL-terminated URL segment, or NULL
 * features  JudySL array of feature counters (may be NULL / empty)
 *
 * Returns the (possibly updated) array handle. If the key buffer cannot
 * be allocated the array is returned unchanged.
 */
static Pvoid_t add_url_component(char * uri, Pvoid_t features)
{
    static const char prefix[] = "URLSeg:";
    const size_t prefix_len = sizeof(prefix) - 1;
    Word_t *PValue;
    size_t uri_len;
    char *token;

    if (uri == NULL)
        return features;

    uri_len = strlen(uri);
    token = malloc(prefix_len + uri_len + 1);
    if (token == NULL)
        return features;

    /* exact-size copies: prefix first, then the segment with its NUL */
    memcpy(token, prefix, prefix_len);
    memcpy(token + prefix_len, uri, uri_len + 1);

    JSLI(PValue, features, (unsigned char*) token);
    (*PValue)++;

    free(token);
    return features;
}
/*
 * Add every path of `pkg` to the global path counter.
 *
 * Walks pkg->paths (a JudySL array) in key order and, for each key,
 * increments the value stored under the same key in _db.paths, creating
 * the entry when it does not exist yet.
 */
void db_filelist_add_pkg_paths(const struct db_pkg* pkg)
{
    gchar key[MAXPATHLEN];
    Word_t *src_slot, *dst_slot;

    /* start the iteration from the smallest possible key */
    key[0] = '\0';
    JSLF(src_slot, pkg->paths, key);
    if (src_slot == NULL)
        return;

    do {
        JSLI(dst_slot, _db.paths, key);
        ++*dst_slot;
        JSLN(src_slot, pkg->paths, key);
    } while (src_slot != NULL);
}
char *test_judy() { Pvoid_t PJArray = (PWord_t)NULL; PWord_t PValue; Word_t Bytes; char dude[10]; char s[256]; strcpy(s, "one two three one 1234567890"); char *token = strtok(s, " "); while (token) { JSLI(PValue, PJArray, token); if (PValue == PJERR) { printf("malloc failed\n"); exit(1); } *PValue += 1; token = strtok(NULL, " "); } dude[0] = '\0'; JSLF(PValue, PJArray, dude); while(PValue != NULL) { printf("%s %d\n", dude, *PValue); JSLN(PValue, PJArray, dude); } JSLFA(Bytes, PJArray); printf("%lu bytes used\n", Bytes); return NULL; }
/*
 * Register `func` under `name` in the package's function table
 * (cls->functions, a JudySL array), replacing any previous entry with the
 * same name. `interp` is not used by this function.
 */
void yogo_define_function(YogoInterp *interp, YogoPackage *cls, const char *name, YogoFunction *func)
{
    const uint8_t *key = (const uint8_t *) name;
    PPvoid_t slot;

    JSLI(slot, cls->functions, key);
    *slot = func;
}
int main(int argc, char *argv[]) { if (argc != 8) { fprintf(stderr, "Usage: ./merge_N [out_dir_name 1] [item_#_file 2] [length_#_file 3] [count_dist_file 4] [number_of_temp_prefix 5] [from_temp_n] [to_temp_n]\n"); // /root/cx_src/src/merge_N /tmp/data /tmp/result/ino.txt /tmp/result/lno.txt /tmp/result/cdo.txt 3 0 1 return -1; } if ((itf = fopen(argv[2], "a")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for writing item numbers\n", argv[2]); return -1; } // fixing each temp file fprintf(stdout, "Start fixing temp files\n"); for (i = atoi(argv[6]); i <= atoi(argv[7]); ++i) { //for (i = 0; i < TEMP_N; ++i) { // fix the temp file i int temp_prefix_num = 0;// num of concurrent threads while (temp_prefix_num < atoi(argv[5])){ sprintf(buffer, "%s/%s%d-%d.txt", argv[1], TEMP_PREFIX, i, temp_prefix_num); fprintf(stdout, "\rWorking on temp file: \"%s\" \n", buffer); //fflush(stdout); if ((tsf[i] = fopen(buffer, "r")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for reading temp strings\n", buffer); //continue; //break; return -1; } while (fscanf(tsf[i], "%"PRId64"\t", &itemn) != EOF) { fgets(Index, BUFFER_SIZE, tsf[i]); for (Len = strlen(Index) - 1; Index[Len] == '\n' || Index[Len] == '\r'; Len--) Index[Len] = 0; ++Len; JSLI(PValNgramS, PJSLNgram, (uint8_t *)Index); if (PValNgramS == PJERR) { fprintf(stderr, "Malloc failed for \"PJSLNgram\"\n"); //return -1; } (*PValNgramS) += itemn; JLI(PValTotC, PJLTotCount, Len); if (PValTotC == PJERR) { fprintf(stderr, "Malloc failed for \"PJLTotCount\"\n"); //return -1; } *PValTotC += itemn; if (*PValNgramS == itemn) { JLI(PValNgramC, PJLNgramCount, Len); if (PValNgramC == PJERR) { fprintf(stderr, "Malloc failed for \"PJLNgramCount\"\n"); //return -1; } ++*PValNgramC; } } sprintf(buffer, "rm %s/%s%d-%d.txt", argv[1], TEMP_PREFIX, i, temp_prefix_num++); if (system(buffer) == -1) { fprintf(stderr, "Failed to execute command: \"%s\"\n", buffer); //return -1; } fclose(tsf[i]); } // write the final temp file 
sprintf(buffer, "%s/%s%d.txt", argv[1], TEMP_PREFIX, i); if ((tsf[i] = fopen(buffer, "w")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for writing temp strings\n", buffer); return -1; } Index[0] = '\0'; JSLF(PValNgramS, PJSLNgram, (uint8_t *)Index); while (PValNgramS != NULL) { fprintf(tsf[i], "%lu\t%s\n", *PValNgramS, Index); Count = *PValNgramS; JLI(PValCountC, PJLCountCount, Count); if (PValCountC == PJERR) { fprintf(stderr, "Malloc failed for \"PJLCountCount\"\n"); return -1; } ++*PValCountC; JSLN(PValNgramS, PJSLNgram, (uint8_t *)Index); } JSLFA(Bytes, PJSLNgram); fflush(tsf[i]); fclose(tsf[i]); fprintf(itf, "Temp file \"%s/%s%d\" uses %lu Bytes of memory\n", argv[1], TEMP_PREFIX, i, Bytes); fflush(itf); } fclose(itf); if ((lef = fopen(argv[3], "a")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for writing length number\n", argv[3]); return -1; } Total = NgramN = 0; JLF(PValTotC, PJLTotCount, Total); JLF(PValNgramC, PJLNgramCount, NgramN); while (PValTotC != NULL) { fprintf(lef, "%lu\t%lu\t%lu\n", Total, *PValNgramC, *PValTotC); JLN(PValTotC, PJLTotCount, Total); JLN(PValNgramC, PJLNgramCount, NgramN); } JLFA(Bytes, PJLTotCount); JLFA(Bytes, PJLNgramCount); fflush(lef); fclose(lef); if ((cdf = fopen(argv[4], "a")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for writing count distribuction\n", argv[4]); return -1; } Count = 0; JLF(PValCountC, PJLCountCount, Count); while (PValCountC != NULL) { fprintf(cdf, "%lu\t%lu\n", Count, *PValCountC); JLN(PValCountC, PJLCountCount, Count); } JLFA(Bytes, PJLCountCount); fflush(cdf); fclose(cdf); return 0; }
int main(int argc, char *argv[]) { if (argc != 7) { fprintf(stderr, "Usage: ./prepare_N [file_prefix] [from_file_num] [number_of_files] [out_dir_name] [item_#_file] [temp_prefix_n]\n"); // /root/cx_src/src/prepare_N /tmp/test/googlebooks-eng-all-5gram-20090715- 0 200 /tmp/data /tmp/result/ino.txt 0 return -1; } // check and set maximun number of newly-created descriptors getrlimit(RLIMIT_NOFILE, &rlim); if (rlim.rlim_cur < FLIMIT) { fprintf(stderr, "Maximum number of newly-created descriptors \"%"PRId64"\" is not enough\n", rlim.rlim_cur); rlim.rlim_cur = rlim.rlim_max = FLIMIT; if (setrlimit(RLIMIT_NOFILE, &rlim)) { fprintf(stderr, "Failed to set the maximum number of newly-created descriptors\n"); return -1; } } if (access(argv[4], F_OK)) { fprintf(stdout, "Directory \"%s\" does not exist, create it\n", argv[4]); if (mkdir(argv[4], S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { fprintf(stderr, "Failed to create directory \"%s\"\n", argv[4]); return -1; } } else fprintf(stdout, "Directory \"%s\" already existed\n", argv[4]); int temp_prefix_num = atoi(argv[6]);// num of concurrent threads for (i = 0; i < TEMP_N; ++i) { sprintf(buffer, "%s/%s%d-%d.txt", argv[4], TEMP_PREFIX, i, temp_prefix_num); if ((tsf[i] = fopen(buffer, "w")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for writing temp strings\n", buffer); for (j = 0; j < i; ++j) fclose(tsf[j]); return -1; } } if ((itf = fopen(argv[5], "a")) == NULL) { fprintf(stderr, "Failed to open file \"%s\" for writing item numbers\n", argv[5]); return -1; } fi = atoi(argv[2]); fn = fi + atoi(argv[3]); for (tot = itemn = Index[0] = 0; fi < fn; ++fi) { //for (fi = tot = itemn = Index[0] = 0; fi < fn; ++fi) { // use "unzip" command to prepare csv input file sprintf(buffer, "unzip %s%d.csv.zip -d /tmp/temp_csv_files/ 1>/dev/null", argv[1], fi); if (system(buffer) == -1) { fprintf(stderr, "Failed to execute command: \"%s\"\n", buffer); return -1; } sprintf(buffer, "/tmp/temp_csv_files/%s%d.csv", 
"googlebooks-eng-all-5gram-20090715-", fi); if ((inf = fopen(buffer, "r")) == NULL) { fprintf(stderr, "Failed to read file \"%s%d\"\n", "googlebooks-eng-all-5gram-20090715-", fi); return -1; } while (fgets(buffer, BUFFER_SIZE, inf) != (char *)NULL) { // data line format: "ngram TAB year TAB match_count TAB volume_count NEWLINE" // take the first TAB as the end of the ngram, and count the number of TAB for (Len = 0; buffer[Len] && buffer[Len] != '\t'; Len++) Index[Len] = buffer[Len]; Index[Len] = 0; for (i = Len, j = 0; buffer[i]; j += (buffer[i++] == '\t')); if (j != 4 || Len <= 1) continue; sscanf(buffer + Len + 1, "%d\t%d", &i, &j); JSLI(PValNgramS, PJSLNgram, (uint8_t *)Index); if (PValNgramS == PJERR) { fprintf(stderr, "Malloc failed for \"PJSLNgram\"\n"); return -1; } *PValNgramS += j; itemn += (*PValNgramS == j); tot += j; if (itemn % 200000 == 0) { fprintf(stdout, "\r%"PRId64" in %"PRId64" of %d", itemn, tot, fi); fflush(stdout); } } fclose(inf); // remove the unziped csv file sprintf(buffer, "rm /tmp/temp_csv_files/%s%d.csv", "googlebooks-eng-all-5gram-20090715-", fi); if (system(buffer) == -1) { fprintf(stderr, "Failed to execute command: \"%s\"\n", buffer); return -1; } fprintf(itf, "%"PRId64" in %"PRId64" of %d\n", itemn, tot, fi); fflush(itf); Index[0] = '\0'; JSLF(PValNgramS, PJSLNgram, (uint8_t *)Index); while (PValNgramS != NULL) { fprintf(tsf[BKDRHash((uint8_t *)Index)], "%lu\t%s\n", *PValNgramS, Index); JSLN(PValNgramS, PJSLNgram, (uint8_t *)Index); } JSLFA(Bytes, PJSLNgram); } for (i = 0; i < TEMP_N; ++i) { fflush(tsf[i]); fclose(tsf[i]); } fprintf(stdout, "\r%"PRId64" in %"PRId64" of %d\n", itemn, tot, fi); return 0; }
/* Synthesize a CALL_EXPR and a TRY_FINALLY_EXPR, for this chain of _DECLs if appropriate. Arrange to call the __mf_register function now, and the __mf_unregister function later for each. Return the gimple sequence after synthesis. */ gimple_seq mx_register_decls (tree decl, gimple_seq seq, gimple stmt, location_t location, bool func_args) { gimple_seq finally_stmts = NULL; gimple_stmt_iterator initially_stmts = gsi_start (seq); bool sframe_inserted = false; size_t front_rz_size, rear_rz_size; tree fsize, rsize, size; gimple uninit_fncall_front, uninit_fncall_rear, init_fncall_front, \ init_fncall_rear, init_assign_stmt; tree fncall_param_front, fncall_param_rear; int map_ret; while (decl != NULL_TREE) { if ((mf_decl_eligible_p (decl) || TREE_CODE(TREE_TYPE(decl)) == ARRAY_TYPE) /* Not already processed. */ && ! mf_marked_p (decl) /* Automatic variable. */ && ! DECL_EXTERNAL (decl) && ! TREE_STATIC (decl) && get_name(decl)) { DEBUGLOG("DEBUG Instrumenting %s is_complete_type %d\n", IDENTIFIER_POINTER(DECL_NAME(decl)), COMPLETE_TYPE_P(decl)); /* construct a tree corresponding to the type struct{ unsigned int rz_front[6U]; original variable unsigned int rz_rear[6U]; }; */ if (!sframe_inserted){ gimple ensure_fn_call = gimple_build_call (lbc_ensure_sframe_bitmap_fndecl, 0); gimple_set_location (ensure_fn_call, location); gsi_insert_before (&initially_stmts, ensure_fn_call, GSI_SAME_STMT); sframe_inserted = true; } // Calculate the zone sizes size_t element_size = 0, request_size = 0; if (COMPLETE_TYPE_P(decl)){ request_size = TREE_INT_CST_LOW(TYPE_SIZE_UNIT(TREE_TYPE(decl))); if (TREE_CODE(TREE_TYPE(decl)) == ARRAY_TYPE) element_size = TREE_INT_CST_LOW(TYPE_SIZE_UNIT(TREE_TYPE(TREE_TYPE(decl)))); else element_size = request_size; } calculate_zone_sizes(element_size, request_size, /*global*/ false, COMPLETE_TYPE_P(decl), &front_rz_size, &rear_rz_size); DEBUGLOG("DEBUG *SIZES* req_size %u, ele_size %u, fsize %u, rsize %u\n", request_size, element_size, front_rz_size, 
rear_rz_size); tree struct_type = create_struct_type(decl, front_rz_size, rear_rz_size); tree struct_var = create_struct_var(struct_type, decl, location); declare_vars(struct_var, stmt, 0); /* Inserting into hashtable */ PWord_t PV; JSLI(PV, decl_map, mf_varname_tree(decl)); gcc_assert(PV); *PV = (PWord_t) struct_var; fsize = convert (unsigned_type_node, size_int(front_rz_size)); gcc_assert (is_gimple_val (fsize)); tree rz_front = TYPE_FIELDS(struct_type); fncall_param_front = mf_mark (build1 (ADDR_EXPR, ptr_type_node, build3 (COMPONENT_REF, TREE_TYPE(rz_front), struct_var, rz_front, NULL_TREE))); uninit_fncall_front = gimple_build_call (lbc_uninit_front_rz_fndecl, 2, fncall_param_front, fsize); init_fncall_front = gimple_build_call (lbc_init_front_rz_fndecl, 2, fncall_param_front, fsize); gimple_set_location (init_fncall_front, location); gimple_set_location (uninit_fncall_front, location); // In complete types have only a front red zone if (COMPLETE_TYPE_P(decl)){ rsize = convert (unsigned_type_node, size_int(rear_rz_size)); gcc_assert (is_gimple_val (rsize)); tree rz_rear = DECL_CHAIN(DECL_CHAIN(TYPE_FIELDS (struct_type))); fncall_param_rear = mf_mark (build1 (ADDR_EXPR, ptr_type_node, build3 (COMPONENT_REF, TREE_TYPE(rz_rear), struct_var, rz_rear, NULL_TREE))); init_fncall_rear = gimple_build_call (lbc_init_rear_rz_fndecl, 2, fncall_param_rear, rsize); uninit_fncall_rear = gimple_build_call (lbc_uninit_rear_rz_fndecl, 2, fncall_param_rear, rsize); gimple_set_location (init_fncall_rear, location); gimple_set_location (uninit_fncall_rear, location); } // TODO Do I need this? #if 0 if (DECL_INITIAL(decl) != NULL_TREE){ // This code never seems to be getting executed for somehting like int i = 10; // I have no idea why? But looking at the tree dump, seems like its because // by the time it gets here, these kind of statements are split into two statements // as int i; and i = 10; respectively. I am leaving it in just in case. 
tree orig_var_type = DECL_CHAIN(TYPE_FIELDS (struct_type)); tree orig_var_lval = mf_mark (build3 (COMPONENT_REF, TREE_TYPE(orig_var_type), struct_var, orig_var_type, NULL_TREE)); init_assign_stmt = gimple_build_assign(orig_var_lval, DECL_INITIAL(decl)); gimple_set_location (init_assign_stmt, location); } #endif if (gsi_end_p (initially_stmts)) { // TODO handle this if (!DECL_ARTIFICIAL (decl)) warning (OPT_Wmudflap, "mudflap cannot track %qE in stub function", DECL_NAME (decl)); } else { #if 0 // Insert the declaration initializer if (DECL_INITIAL(decl) != NULL_TREE) gsi_insert_before (&initially_stmts, init_assign_stmt, GSI_SAME_STMT); #endif //gsi_insert_before (&initially_stmts, register_fncall, GSI_SAME_STMT); gsi_insert_before (&initially_stmts, init_fncall_front, GSI_SAME_STMT); if (COMPLETE_TYPE_P(decl)) gsi_insert_before (&initially_stmts, init_fncall_rear, GSI_SAME_STMT); /* Accumulate the FINALLY piece. */ //gimple_seq_add_stmt (&finally_stmts, unregister_fncall); gimple_seq_add_stmt (&finally_stmts, uninit_fncall_front); if (COMPLETE_TYPE_P(decl)) gimple_seq_add_stmt (&finally_stmts, uninit_fncall_rear); } mf_mark (decl); } decl = DECL_CHAIN (decl); } /* Actually, (initially_stmts!=NULL) <=> (finally_stmts!=NULL) */ if (finally_stmts != NULL) { gimple stmt = gimple_build_try (seq, finally_stmts, GIMPLE_TRY_FINALLY); gimple_seq new_seq = gimple_seq_alloc (); gimple_seq_add_stmt (&new_seq, stmt); return new_seq; } else return seq; }
void map_set(void ** map, char * key, void * val) { PWord_t v; JSLI(v, *map, key); *v = (int)val; }
int main(int argc, char **argv) { Pvoid_t rixicon = NULL; Pvoid_t new_ixi = NULL; Pvoid_t lang_freq = NULL; Word_t *f = NULL; Word_t idx = 0; uint freq_lim = 0; uint i, j, k; const char *ix_name = NULL; const char *stats_name = NULL; dub_init(); stats_name = pparm_common_name("istats"); ix_name = pparm_common_name("ixi"); PPARM_INT(freq_lim, NOF_FREQUENT); if (freq_lim >= XID_TOKEN_FREQUENT_L) dub_die("NOF_FREQUENT must be less than %u", XID_TOKEN_FREQUENT_L); load_ixicon(ix_name); load_istats(stats_name); dub_msg("Overwriting old %s and %s", ix_name, stats_name); qsort(istats, istats_len, sizeof(struct istat_entry), fix_cmp); rixicon = reverse_ixicon(); dub_dbg("ISTATS_LEN %u", istats_len); for (k = 0, j = 0, i = 0; i < istats_len; i++){ Word_t *val = NULL; idx = istats[i].xid; if (idx > XID_TOKEN_L) continue; JLG(val, rixicon, idx); if (!val) dub_die("No matching ixeme for ID %u", idx); JSLI(f, new_ixi, (const char*)*val); JLI(val, lang_freq, (Word_t)istats[i].lang_code); if (*val < freq_lim){ ++*val; *f = ++j; if (j >= XID_TOKEN_FREQUENT_L) dub_die("Too many frequent ixemes. " "Consider lowering NOF_FREQUENT (now %u)", freq_lim); }else *f = XID_TOKEN_FREQUENT_L + ++k; istats[i].xid = *f; } copy_sites(new_ixi); create_ixicon(ix_name, new_ixi); close_istats(); return 0; }
int main(int argc, char **argv) { const char *ixicon_file = NULL; const char *istats_file = NULL; const char *qexp_file = NULL; Pvoid_t rixicon = NULL; Pvoid_t lemmas = NULL; Pvoid_t qexp = NULL; uint i; uint nl = 0; uint ul = 0; u32 xid = 0; u32 freq_xid = 0; uint not_qexp = 0; dub_init(); ixicon_file = pparm_common_name("ixi"); istats_file = pparm_common_name("istats"); qexp_file = pparm_common_name("qexp"); PPARM_INT(max_len, MAX_LEMMA_LEN); if (getenv("LOCALE")){ if(!setlocale(LC_ALL, getenv("LOCALE"))) dub_sysdie("Couldn't set locale %s", getenv("LOCALE")); else dub_msg("Locale set to %s", getenv("LOCALE")); } /* qexp_file might not exist; no problem */ not_qexp = load_qexp(qexp_file, 1); load_ixicon(ixicon_file); rixicon = reverse_ixicon(); load_istats(istats_file); snowball_init(); for (i = 0; i < istats_len; i++){ Word_t idx = (Word_t)istats[i].xid; Word_t *val = NULL; const char *lemma = NULL; const char *token = NULL; u32 lid = 0; if (idx > XID_TOKEN_L) continue; JLG(val, rixicon, idx); if (!val) dub_die("Ixicon doesn't contain xid %u", istats[i].xid); token = (const char*)*val; if (istats[i].lang_code < XID_META_LANG_F || istats[i].lang_code > XID_META_LANG_L) continue; if (!lemmatizable(token)){ dub_dbg("Not lemmatizable: %s", token); continue; } lemma = snowball_lemmatize(token, istats[i].lang_code); /* no lemma found (unknown language etc.) 
*/ if (!lemma) continue; JSLI(val, lemmas, lemma); if (!*val){ if (strlen(lemma) > longest_lemma) longest_lemma = strlen(lemma); glist *lst = xmalloc(sizeof(glist) + 4); lst->len = 1; /* unseen lemma */ /* if a token is frequent, its lemma is also */ if (idx < XID_TOKEN_FREQUENT_L){ lid = XID_META_FREQUENT_F + freq_xid++; if (freq_xid >= XID_META_FREQUENT_L) /* freq_ixicon has failed somehow */ dub_die("Too many frequent ixemes"); }else lid = XID_META_FREQUENT_L + xid++; if (lid > XID_META_LEMMA_L) dub_die("Lemma ID range exhausted."); lst->lst[0] = lid; *val = (Word_t)lst; ++ul; } lid = *val; JLI(val, qexp, idx); *val = lid; ++nl; } if (not_qexp){ dub_msg("Qexp file %s doesn't exist. " "Creating qexp from the scratch.", qexp_file); }else{ dub_msg("Qexp file %s exists. Lemmas will be merged to it.", qexp_file); qexp = qexp_merge(qexp); close_qexp(); } create_qexp(qexp_file, qexp); if (getenv("OUTPUT_LEMMAS")) output_lemmas(lemmas); dub_msg("%u / %u ixemes were lemmatized", nl, istats_len); dub_msg("%u different lemmas were found", ul); return 0; }