Example #1
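/*
 * Index a single file: ansi_filter() loads the file and strips ANSI
 * escapes into the global text buffer, next_token() walks that buffer,
 * and every token's posting list gets the current g_docid appended.
 * The docid -> path mapping is written through the global DB handle.
 * All globals used here (g_len, g_pos, g_docid, g_bucket, g_size,
 * g_key, g_data, g_dbp) are assumed to be initialized by the caller.
 */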
int
index_file(char *filename)
{
	char *word;
	struct postinglist *p;

	
	ansi_filter(filename);
	if (g_len != 0) {
		fprintf(stderr, "%d indexing %s\n", g_docid, filename);
		/* @TODO: save to cache file */
		g_pos = 0;
		while ((word = next_token())) {
			//DEBUG1("%s", word);
			p = get_postinglist(g_bucket, g_size, word);
			if (p->freq == p->size) /* is full */
				double_postinglist(p);
			addto_postinglist(p, g_docid);
		}
		g_data.size = strlen(filename) + 1;
		g_data.data = filename;		
		/* write_docid2path */
		g_dbp->put(g_dbp, NULL, &g_key, &g_data, 0);
		g_docid++;
	}
	return 0;
}
Example #2
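/*
 * Minimal driver: strip ANSI escapes from the file named on the command
 * line and write the filtered text (g_len bytes, as set by ansi_filter)
 * to output.txt.
 */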
int main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	char *t = ansi_filter(argv[1]);
	FILE *fp = fopen("output.txt", "w");
	if (fp == NULL) {
		free(t);
		return 1;
	}
	fwrite(t, g_len, 1, fp);
	free(t);
	fclose(fp);
	return 0;
}
Example #3
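/*
 * Dump the "top hot topics" ranking for one period (titles[type]) into
 * its stat file as ANSI-colored text: one line per topic with rank,
 * board, date, article count, owner and the ANSI-filtered title.  The
 * two-argument ansi_filter(dst, src) used here apparently copies src
 * into dst with escape sequences stripped and returns dst.
 */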
void print_stat(const hash_t *ht, top_t **tops, int type)
{
    char file[HOMELEN];
    sprintf(file, BASEPATH"/%s", files[type]);
    FILE *fp = fopen(file, "w+");
    if (fp == NULL)
        return;
    fprintf(fp, "                \033[1;34m-----\033[37m=====\033[41m"
            " 本%s大热门话题 \033[40m=====\033[34m-----\033[m\n\n",
            titles[type]);
    top_t *top;
    int i;
    int limit = limits[type] < ht->count ? limits[type] : ht->count;
    char date[32];
    char title[sizeof(top->title)];
    for (i = 0; i < limit; ++i) {
        top = tops[i];
        strlcpy(date, ctime(&top->last) + 4, 16);
        fprintf(fp, "\033[1;37m第\033[31m%3u\033[37m 名 \033[37m信区 : \033[33m"
                "%-18s\033[37m〖 \033[32m%s\033[37m 〗\033[36m%4d \033[37m篇"
                "\033[33m%13.13s\n     \033[37m标题 : \033[1;44m%-60.60s"
                "\033[40m\n", i + 1, top->board, date, top->count, top->owner,
                ansi_filter(title, top->title));
    }
    fclose(fp);
}
Example #4
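/*
 * Build the inverted index for one board: every article listed in the
 * board's directory file is ANSI-filtered, cached as a gzip file,
 * tokenized, and its docid added to the in-memory posting lists; the
 * docid -> path mapping goes into a Berkeley DB file, and the posting
 * lists, per-document weights and document count are written out at
 * the end.  Returns 0 on success, -1 on error.
 */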
/*
 *  1  output_file = new file()
 *  2  dict = new hash()
 *  3  while (free memory available)
 *  4  do token = next_token()
 *  5      if token not in dict
 *  6          postinglist = addtodict(dict, token)
 *  7      else postinglist = getpostinglist(dict, token)
 *  8      if full(postinglist)
 *  9          postinglist = doublepostinglist(dict, token)
 * 10      addtopostinglist(postinglist, docid(token))
 * 11  sorted_terms = sortterm(dict)	// for merge purpose
 * 12  writeblock(sorted_terms, dict, output_file)
 */
int
build_board_index(char *bname)
{
	char *word;
	char dirfile[PATH_MAX], docid2path[PATH_MAX], indexfile[PATH_MAX];
	char filepath[PATH_MAX]; /* article file path */
	char filename[20];
	char cachedir[PATH_MAX];
	char cachefile[PATH_MAX];
	char ndocsfile[PATH_MAX];
	DB *dbp;
	DBT key, data;
	int ret;
	int result = -1;
	FILE *filelist, *fp;
	struct postinglist *p;
	unsigned int docid = 1;	/* docids start at 1, so (docid - 1) is the document count */
	gzFile cachefp;
	int gzerr;

	
	setboardfile(dirfile, bname, bname);		/* board article list */
	set_brddocid2path_file(docid2path, bname);	/* docid -> path DB */
	set_brdindex_file(indexfile, bname);		/* on-disk index file */

	/* Initialize the DB structure. */
	ret = db_create(&dbp, NULL, 0);
	if (ret != 0) {
		ERROR("create db handle failed");
		goto RETURN;
	}

	if (dbopen(dbp, docid2path, 1) != 0) {
		ERROR1("open db %s failed", docid2path);
		goto RETURN;		
	}
		
	if (!(filelist = fopen(dirfile, "r"))) {
		ERROR1("open file %s failed", dirfile);
		goto CLEAN_DB;
	}
	
	size_t size = 300000;	/* TODO: define this constant */
	struct dict_t **bucket = new_postinglist_bucket(size);
	if (bucket == NULL) {
		ERROR1("new_postinglist_bucket size=%zu failed", size);
		goto CLEAN_FP;
	}

	g_text = malloc(MAX_FILE_SIZE);
	if (g_text == NULL) {
		ERROR("malloc failed");
		goto CLEAN_MEM;
	}

	/* Zero out the DBTs before using them. */
	memset(&key, 0, sizeof(DBT));
	memset(&data, 0, sizeof(DBT));
	
	key.size = sizeof(unsigned int);
	key.data = &docid;
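	/* key always points at the numeric docid; data is filled with the
	   article file name for each document inside the loop below */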
	
	/* ensure the cache directory exists */
	setcachepath(cachedir, bname);
	f_mkdir(cachedir, 0755);

	/* one pass over the board's article list: filter each article, cache
	   the filtered text, post its tokens, and record its docid -> path
	   mapping */
	while (fgets(filename, sizeof(filename), filelist)) {
		filename[strcspn(filename, "\r\n")] = '\0'; /* strip trailing newline */
	
		data.size = strlen(filename) + 1;
		data.data = filename;
		
		setboardfile(filepath, bname, filename);
		ansi_filter(filepath);
		
		if (g_len != 0) {
			fprintf(stderr, "%u indexing %s\n", docid, filename);
			/* save to cache file */
			setcachefile(cachefile, bname, filename);
			cachefp = gzopen(cachefile, "wb");
			if (cachefp != NULL) {
				if (gzwrite(cachefp, g_text, g_len) != g_len) 
					ERROR(gzerror(cachefp, &gzerr));
				gzclose(cachefp);
			}
			
			g_pos = 0;
			while ((word = next_token())) {
				//DEBUG1("%s", word);
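				/* fetch (or create) the term's posting list, grow it
				   when full, and append the current docid */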
				p = get_postinglist(bucket, size, word);
				if (p->freq == p->size) /* is full */
					double_postinglist(p);
				addto_postinglist(p, docid);
			}
			
			/* write_docid2path */
			dbp->put(dbp, NULL, &key, &data, 0);
			docid++;
		}
	}

	/* flush the in-memory posting lists to the on-disk index, then store
	   per-document weights and the total number of documents indexed */
	write_index_file(bucket, size, indexfile);
	calc_doc_weight(bname, BOARD, docid - 1);
	set_brdndocs_file(ndocsfile, bname);
	fp = fopen(ndocsfile, "w");
	if (fp == NULL) {
		ERROR1("fopen %s failed", ndocsfile);
		goto CLEAN;
	}
	fprintf(fp, "%u", docid - 1);
	fclose(fp);

	/* it's ok */
	result = 0;
CLEAN:	
	free(g_text);	
CLEAN_MEM:
	free(bucket);
CLEAN_FP:
	fclose(filelist);	
CLEAN_DB:
	if (dbp != NULL)
		dbp->close(dbp, 0);
RETURN:
	return result;
}