int index_file(char *filename) { char *word; struct postinglist *p; ansi_filter(filename); if (g_len != 0) { fprintf(stderr, "%d indexing %s\n", g_docid, filename); /* @TODO: save to cache file */ g_pos = 0; while ((word = next_token())) { //DEBUG1("%s", word); p = get_postinglist(g_bucket, g_size, word); if (p->freq == p->size) /* is full */ double_postinglist(p); addto_postinglist(p, g_docid); } g_data.size = strlen(filename) + 1; g_data.data = filename; /* write_docid2path */ g_dbp->put(g_dbp, NULL, &g_key, &g_data, 0); g_docid++; } return 0; }
/*
 * Filter ANSI escape sequences out of the file named by argv[1] and write
 * the filtered text (g_len bytes, set by ansi_filter) to "output.txt".
 * Returns 0 on success, 1 on usage or I/O error.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {                         /* original deref'd argv[1] blindly */
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    char *text = ansi_filter(argv[1]);
    if (text == NULL) {                     /* assumes NULL on failure -- confirm */
        fprintf(stderr, "ansi_filter failed on %s\n", argv[1]);
        return 1;
    }

    FILE *fp = fopen("output.txt", "w");
    if (fp == NULL) {                       /* original fwrote to a NULL stream */
        perror("fopen output.txt");
        free(text);
        return 1;
    }

    /* fwrite with nmemb==1 returns 1 on a complete write */
    if (g_len > 0 && fwrite(text, g_len, 1, fp) != 1)
        fprintf(stderr, "short write to output.txt\n");

    free(text);
    if (fclose(fp) != 0)                    /* fclose flushes; check it */
        perror("fclose output.txt");
    return 0;
}
/*
 * Write the "hot topics" ranking for the given stat type to its file.
 * 'tops' is assumed to be pre-sorted descending by heat; at most
 * min(limits[type], ht->count) entries are printed. Output is ANSI-colored
 * text in the format the BBS front-end expects.
 */
void print_stat(const hash_t *ht, top_t **tops, int type)
{
    char file[HOMELEN];
    top_t *top;
    int i;
    int limit = limits[type] < ht->count ? limits[type] : ht->count;
    char date[32];
    char title[sizeof(top->title)];

    /* bounded format instead of sprintf: path must not overflow 'file' */
    snprintf(file, sizeof(file), BASEPATH"/%s", files[type]);
    FILE *fp = fopen(file, "w+");
    if (fp == NULL)     /* original passed a NULL stream to fprintf */
        return;

    fprintf(fp, " \033[1;34m-----\033[37m=====\033[41m"
            " 本%s大热门话题 \033[40m=====\033[34m-----\033[m\n\n",
            titles[type]);

    for (i = 0; i < limit; ++i) {
        top = tops[i];
        /* ctime(): "Www Mmm dd hh:mm:ss yyyy\n"; +4 skips the weekday,
         * strlcpy keeps at most 15 chars ("Mmm dd hh:mm:s"). */
        strlcpy(date, ctime(&top->last) + 4, 16);
        /* NOTE(review): 'date' fills the 〖 〗 slot and 'owner' the
         * %13.13s slot -- looks swapped vs. the usual layout; confirm
         * intended output before reordering. Behavior kept as-is. */
        fprintf(fp, "\033[1;37m第\033[31m%3u\033[37m 名 \033[37m信区 : \033[33m"
                "%-18s\033[37m〖 \033[32m%s\033[37m 〗\033[36m%4d \033[37m篇"
                "\033[33m%13.13s\n \033[37m标题 : \033[1;44m%-60.60s"
                "\033[40m\n", i + 1, top->board, date, top->count,
                top->owner, ansi_filter(title, top->title));
    }
    fclose(fp);
}
/**1 output_file = new file() 2 dict = new hash() 3 while (free memory available) 4 do token = next_token() 5 if token not in dict 6 postinglist = addtodict(dict, token) 7 else postinglist = getpostinglist(dict, token) 8 if full(postinglist) 9 postinglist = doublepostinglist(dict, token) 10 addtopostinglist(postinglist, docid(token)) 11 sorted_terms = sortterm(dict) // for merge purpose *12 writeblock(sorted_terms, dict, output_file) */ int build_board_index(char *bname) { char *word; char dirfile[PATH_MAX], docid2path[PATH_MAX], indexfile[PATH_MAX]; char filepath[PATH_MAX]; /* article file path */ char filename[20]; char cachedir[PATH_MAX]; char cachefile[PATH_MAX]; char ndocsfile[PATH_MAX]; DB *dbp; DBT key, data; int ret; int result = -1; FILE *filelist, *fp; struct postinglist *p; unsigned int docid = 1; gzFile cachefp; int gzerr; setboardfile(dirfile, bname, bname); set_brddocid2path_file(docid2path, bname); set_brdindex_file(indexfile, bname); /* Initialize the DB structure.*/ ret = db_create(&dbp, NULL, 0); if (ret != 0) { ERROR("create db hanldle failed"); goto RETURN; } if (dbopen(dbp, docid2path, 1) != 0) { ERROR1("open db %s failed", docid2path); goto RETURN; } if (!(filelist = fopen(dirfile, "r"))) { ERROR1("open file %s failed", dirfile); goto CLEAN_DB; } size_t size = 300000; /* TODO: define this constant */ struct dict_t **bucket = new_postinglist_bucket(size); if (bucket == NULL) { ERROR1("new_dict size=%u failed", size); goto CLEAN_FP; } g_text = malloc(MAX_FILE_SIZE); if (g_text == NULL) { ERROR("malloc failed"); goto CLEAN_MEM; } /* Zero out the DBTs before using them. 
*/ memset(&key, 0, sizeof(DBT)); memset(&data, 0, sizeof(DBT)); key.size = sizeof(unsigned int); key.data = &docid; /* ensure the cache directory exists */ setcachepath(cachedir, bname); f_mkdir(cachedir, 0755); while (fgets(filename, sizeof(filename), filelist)) { filename[strlen(filename) - 1] = '\0'; data.size = strlen(filename) + 1; data.data = filename; setboardfile(filepath, bname, filename); ansi_filter(filepath); if (g_len != 0) { fprintf(stderr, "%d indexing %s\n", docid, filename); /* save to cache file */ setcachefile(cachefile, bname, filename); cachefp = gzopen(cachefile, "wb"); if (cachefp != NULL) { if (gzwrite(cachefp, g_text, g_len) != g_len) ERROR(gzerror(cachefp, &gzerr)); gzclose(cachefp); } g_pos = 0; while ((word = next_token())) { //DEBUG1("%s", word); p = get_postinglist(bucket, size, word); if (p->freq == p->size) /* is full */ double_postinglist(p); addto_postinglist(p, docid); } /* write_docid2path */ dbp->put(dbp, NULL, &key, &data, 0); docid++; } } write_index_file(bucket, size, indexfile); calc_doc_weight(bname, BOARD, docid - 1); set_brdndocs_file(ndocsfile, bname); fp = fopen(ndocsfile, "w"); if (fp == NULL) { ERROR1("fopen %s failed", ndocsfile); goto CLEAN; } fprintf(fp, "%u", docid - 1); fclose(fp); /* it's ok */ result = 0; CLEAN: free(g_text); CLEAN_MEM: free(bucket); CLEAN_FP: fclose(filelist); CLEAN_DB: if (dbp != NULL) dbp->close(dbp, 0); RETURN: return result; }