コード例 #1
0
ファイル: QSort.c プロジェクト: ZhanweiWu/MangoDict
/*
 * qsort()-style comparator for an array whose elements are C-string
 * pointers: each argument points at one element, so it is dereferenced
 * once to reach the string, and the two strings are ordered with
 * stardict_strcmp().
 */
static int compare_str(const void * a, const void * b)
{
	char *lhs = *(char * const *)a;
	char *rhs = *(char * const *)b;

	return stardict_strcmp(lhs, rhs);
}
コード例 #2
0
/*
 * GCompareFunc for arrays of struct _synworditem.
 *
 * Primary key: the synonym text, ordered with stardict_strcmp().
 * Tie-break:   the address stored in the item's `definition` field, so that
 *              items sharing a synonym are grouped by the definition they
 *              point at.
 *
 * Both elements are accessed as struct _synworditem; the previous code read
 * `definition` through struct _worditem, which is only correct if the two
 * struct layouts happen to coincide. The tie-break compares the addresses as
 * integers instead of subtracting the pointers: the pointers do not have to
 * belong to the same object, and a pointer difference narrowed to gint can
 * lose its sign on 64-bit targets.
 *
 * Returns a negative, zero or positive value in the usual comparator sense.
 */
gint comparefunc2(gconstpointer a,gconstpointer b)
{
        const struct _synworditem *lhs = (const struct _synworditem *)a;
        const struct _synworditem *rhs = (const struct _synworditem *)b;
        gint x;
        gsize lhs_addr, rhs_addr;

        x = stardict_strcmp(lhs->synword, rhs->synword);
        if (x != 0)
                return x;

        lhs_addr = GPOINTER_TO_SIZE(lhs->definition);
        rhs_addr = GPOINTER_TO_SIZE(rhs->definition);
        return (lhs_addr > rhs_addr) - (lhs_addr < rhs_addr);
}
コード例 #3
0
ファイル: ydp2dict.c プロジェクト: sridrlng143/stardict-3
int cmp(const void *s1, const void *s2)
{
    PAIR *a, *b;

    a= (PAIR *)s1;
    b= (PAIR *)s2;

    return stardict_strcmp(a->words, b->words);
}
コード例 #4
0
/*
 * GCompareFunc for arrays of struct _worditem: orders two items by their
 * `word` member using stardict_strcmp().
 */
gint comparefunc(gconstpointer a,gconstpointer b)
{
	struct _worditem *lhs = (struct _worditem *)a;
	struct _worditem *rhs = (struct _worditem *)b;

	return stardict_strcmp(lhs->word, rhs->word);
}
コード例 #5
0
ファイル: IdxsynFile.c プロジェクト: ZhanweiWu/MangoDict
/*
 * Look up `str` in a paged index using a two-level binary search.
 *
 * Level 1 searches the first key of each page (IdxsynFile_GetFirstOnPageKey)
 * to pick the page that could contain `str`; level 2 loads that page
 * (IdxsynFile_LoadPage) and searches its entries. All comparisons use
 * stardict_strcmp().
 *
 * pMe          index object; its page cache (pMe->page) is reloaded as a
 *              side effect.
 * str          key to search for.
 * idx          out: global entry index (page * ENTR_PER_PAGE + position in
 *              page). On a miss it is the position where the in-page search
 *              ended (see "next" below); 0 if `str` sorts before the first
 *              key; INVALID_INDEX if it sorts after the last key.
 * idx_suggest  out: equal to *idx on a hit; on a miss, an index chosen by
 *              walking backwards over entries compared with prefix_match();
 *              wordcount - 1 if `str` sorts after the last key.
 *
 * Returns TRUE when an entry compares equal to `str`, FALSE otherwise.
 */
boolean IdxsynFile_Lookup(IdxsynFile * pMe, const char *str, long *idx, long *idx_suggest)
{
	long idx2 = 0, idx_suggest2 = 0;
	boolean bFound = FALSE;
	long iFrom = 0;
	// NOTE(review): the page search stops at npages - 2; presumably the last
	// page slot is a sentinel rather than a real page -- confirm.
	long iTo = pMe->npages - 2;
	int cmpint = 0;
	long iThisIndex = 0;

	// Fast rejects: key lies outside the [first, real_last] key range.
	if (stardict_strcmp(str, pMe->first.keystr)<0) {
		*idx = 0;
		*idx_suggest = 0;
		return FALSE;
	} else if (stardict_strcmp(str, pMe->real_last.keystr) >0) {
		*idx = INVALID_INDEX;
		*idx_suggest = pMe->wordcount - 1;
		return FALSE;
	} else {
		// find the page number where the search word might be
		iFrom = 0;
		iThisIndex = 0;
		while (iFrom <= iTo) {
			iThisIndex = (iFrom + iTo) / 2;
			cmpint = stardict_strcmp(str, IdxsynFile_GetFirstOnPageKey(pMe, iThisIndex));
			if (cmpint > 0)
				iFrom = iThisIndex + 1;
			else if (cmpint < 0)
				iTo = iThisIndex - 1;
			else {
				bFound = TRUE;
				break;
			}
		}
		if (!bFound) {
			// No page starts with str: when the loop ends, iTo is the last
			// page whose first key compared less than str.
			idx2 = iTo;    //prev
		} else {
			idx2 = iThisIndex;
		}
	}
	if (!bFound) {
		// the search word is on the page number idx if it's anywhere
		uint32 netr = IdxsynFile_LoadPage(pMe, idx2);
		iFrom =  1; // Needn't search the first word anymore.
		iTo = netr - 1;
		iThisIndex = 0;
		while (iFrom <= iTo) {
			iThisIndex = (iFrom + iTo) / 2;
			cmpint = stardict_strcmp(str, pMe->page.entries[iThisIndex].keystr);
			if (cmpint > 0)
				iFrom = iThisIndex + 1;
			else if (cmpint < 0)
				iTo = iThisIndex - 1;
			else {
				bFound = TRUE;
				break;
			}
		}
		// Convert the page number into the global index of the page's first entry.
		idx2 *= ENTR_PER_PAGE;
		if (!bFound) {
			int best, back;
			// iFrom is where the in-page search ended, i.e. the first
			// position whose key compared greater than str.
			idx2 += iFrom;    //next
			idx_suggest2 = idx2;

			// Walk backwards from that position while the preceding entry's
			// prefix_match() score is non-zero and not lower than the best
			// score seen so far; the walk may cross into earlier pages.
			best = prefix_match (str, pMe->page.entries[idx_suggest2 % ENTR_PER_PAGE].keystr);
			for (;;) {
				if ((iTo = idx_suggest2 - 1) < 0)
					break;
				// Stepping back from the first slot of a page: load the
				// page that holds the previous entry.
				if (idx_suggest2 % ENTR_PER_PAGE == 0)
					IdxsynFile_LoadPage(pMe, iTo / ENTR_PER_PAGE);
				back = prefix_match (str, pMe->page.entries[iTo % ENTR_PER_PAGE].keystr);
				if (!back || back < best)
					break;
				best = back;
				idx_suggest2 = iTo;
			}
		} else {
			idx2 += iThisIndex;
			idx_suggest2 = idx2;
		}
	} else {
		// str equals the first key of page idx2, so its global index is the
		// page's first slot.
		idx2 *=ENTR_PER_PAGE;
		idx_suggest2 = idx2;
	}

	*idx = idx2;
	*idx_suggest = idx_suggest2;

	return bFound;
}
コード例 #6
0
void builddata(gchar *datafilename, glong &wordcount, glong &idxfilesize, glong &synwordcount, std::list<std::string> *TagList, std::list<std::string> *ElementList)
{
	struct stat stats;
	if (stat (datafilename, &stats) == -1) {
		printf("File %s not exist!\n", datafilename);
		return;
	}
	FILE *datafile;
	datafile = fopen(datafilename,"r");
	gchar *buffer = (gchar *)g_malloc (stats.st_size + 1);
	fread (buffer, 1, stats.st_size, datafile);
	fclose (datafile);
	buffer[stats.st_size] = '\0';

	GArray *array = g_array_sized_new(FALSE,FALSE, sizeof(struct _worditem),20000);
	GArray *array2 = g_array_sized_new(FALSE,FALSE, sizeof(struct _synworditem),20000);
	gchar *p, *p1, *p2, *p3, *p4, *p5;
	p = buffer;
	struct _worditem worditem;
	while (1) {
		p1 = strstr(p, "<单词块>");
		if (!p1) {
			g_print("over\n");
			break;
		}
		p1 += strlen("<单词块>");
		p2 = strstr(p1, "</单词块>");
		if (!p2) {
			g_print("Error, no </单词块>\n");
			return;
		}
		*p2='\0';
		p2 += strlen("</单词块>");
		p3 = strstr(p1, "<单词>");
		p5 = p1;
		while (g_ascii_isspace(*p5))
			p5++;
		if (p5!=p3) {
			g_print("Warning, not begin with <单词>.\n");
		}
		if (!p3) {
			g_print("Error, no <单词>\n");
			return;
		}
		p3 += strlen("<单词>");
		p4 = strstr(p3, "</单词>");
		if (!p4) {
			g_print("Error, no </单词>\n");
			return;
		}
		*p4='\0';
		p4 += strlen("</单词>");
		worditem.word = get_cdata(p3);
		if (!worditem.word) {
			return;
		}
		if (!worditem.word[0]) {
			g_print("Bad word!\n");
			p = p2;
			continue;
		}
		while (g_ascii_isspace(*p4)) {
			p4++;
		}
		worditem.definition = p4;
		g_strstrip(worditem.definition);
		if (!worditem.definition[0]) {
			g_print("Bad definition!\n");
			return;
		}
		ParseUserData Data;
		Data.word = worditem.word;
		Data.definition = worditem.definition;
		std::list<std::string> WordList;
		Data.WordList = &WordList;
		Data.array = array2;
		Data.TagList = TagList;
		Data.ElementList = ElementList;
		parse_definition(worditem.definition, &Data);
		g_array_append_val(array, worditem);
		p = p2;
	}
	g_array_sort(array,comparefunc);
	g_array_sort(array2,comparefunc2);

	gchar *basefilename = g_strdup(datafilename);
	p = strchr(basefilename, '.');
	if (p)
		*p='\0';
	gchar idxfilename[256];
	gchar dicfilename[256];
	sprintf(idxfilename, "powerword2007_%s.idx", basefilename);
	sprintf(dicfilename, "powerword2007_%s.dict", basefilename);
	FILE *idxfile = fopen(idxfilename,"w");
	FILE *dicfile = fopen(dicfilename,"w");

	guint32 offset_old;
        guint32 tmpglong;
        struct _worditem *pworditem;         gint definition_len;
        gulong i;
        for (i=0; i< array->len; i++) {
                offset_old = ftell(dicfile);
                pworditem = &g_array_index(array, struct _worditem, i);
                definition_len = strlen(pworditem->definition);
                fwrite(pworditem->definition, 1 ,definition_len,dicfile);
                fwrite(pworditem->word,sizeof(gchar),strlen(pworditem->word)+1,idxfile);
                tmpglong = g_htonl(offset_old);
                fwrite(&(tmpglong),sizeof(guint32),1,idxfile);
                tmpglong = g_htonl(definition_len);
                fwrite(&(tmpglong),sizeof(guint32),1,idxfile);
        }
	idxfilesize = ftell(idxfile);
        fclose(idxfile);
	fclose(dicfile);
	g_print("%s wordcount: %d\n", datafilename, array->len);
	wordcount = array->len;

	synwordcount = array2->len;
	if (array2->len) {
                gchar synfilename[256];
                sprintf(synfilename, "powerword2007_%s.syn", basefilename);
                FILE *synfile = fopen(synfilename,"w");
                struct _synworditem *psynworditem;
                gint iFrom, iTo, iThisIndex, cmpint;
                bool bFound;
                for (i=0; i< array2->len; i++) {
                        psynworditem = &g_array_index(array2, struct _synworditem, i);
                        fwrite(psynworditem->synword, 1, strlen(psynworditem->synword)+1, synfile);
			g_free(psynworditem->synword);
                        bFound=false;
                        iFrom=0;
                        iTo=array->len-1;
                        while (iFrom<=iTo) {
                                iThisIndex=(iFrom+iTo)/2;
                                pworditem = &g_array_index(array, struct _worditem, iThisIndex);
                                cmpint = stardict_strcmp(psynworditem->origword, pworditem->word);
                                if (cmpint>0)
                                        iFrom=iThisIndex+1;
                                else if (cmpint<0)
                                        iTo=iThisIndex-1;
                                else {
                                        bFound=true;
                                        break;
                                }

                        }
                        if (!bFound) {
                                g_print("Error, %s not find.\n", psynworditem->origword);
                                return;
                        }
                        do {
                                if (iThisIndex==0)
                                        break;
                                pworditem = &g_array_index(array, struct _worditem, iThisIndex-1);
				if (strcmp(psynworditem->origword, pworditem->word)==0)
                                        iThisIndex--;
                                else
                                        break;
                        } while (true);
                        bFound=false;
                        do {
                                pworditem = &g_array_index(array, struct _worditem, iThisIndex);
                                if (strcmp(psynworditem->origword, pworditem->word)==0) {
                                        if (psynworditem->definition == pworditem->definition) {
                                                bFound=true;
                                                break;
                                        } else
                                                iThisIndex++;
                                } else
                                        break;
                        } while (true);
                        if (!bFound) {
                                g_print("Error, %s definition not find.\n", psynworditem->origword);
                                return;
                        }
                        tmpglong = g_htonl(iThisIndex);
                        fwrite(&(tmpglong),sizeof(guint32),1, synfile);
                }
                fclose(synfile);
                g_print("synwordcount: %d\n", array2->len);
        }
コード例 #7
0
/*
 * Load and verify the synonyms file "<basefilename>.syn".
 *
 * The whole file is read into memory and parsed as a sequence of records,
 * each a NUL-terminated word followed by a 32-bit index in network byte
 * order. Every record is validated (UTF-8, XML-invalid characters, key
 * length, leading/trailing spaces, forbidden key characters, emptiness,
 * ordering against the previous word, index range) and appended to
 * `synindex`. When `fix_errors` is set, repairable problems are corrected in
 * the in-memory copy and a "fixed" message is logged.
 *
 * Returns the combined (worst) verification result. If the synonym count in
 * `dict_info` is 0 and no .syn file exists, returns VERIF_RESULT_OK without
 * reading anything. If the file cannot be stat'ed, opened or read, returns
 * VERIF_RESULT_CRITICAL (and, with `fix_errors`, resets the synonym count so
 * the file is ignored).
 */
VerifResult binary_dict_parser_t::load_syn_file(void)
{
	synfilename = basefilename + ".syn";
	VerifResult result = VERIF_RESULT_OK;

	if (dict_info.get_synwordcount() == 0) {
		if (g_file_test(synfilename.c_str(), G_FILE_TEST_EXISTS)) {
			// A .syn file exists although none is announced.
			g_warning(syn_file_exist_msg);
			result = combine_result(result, VERIF_RESULT_WARNING);
			if(fix_errors) {
				g_message(fixed_process_syn_file_msg);
			} else
				return result;
		} else
			return result;
	}

	guint32 synfilesize;
	{
		stardict_stat_t stats;
		if (g_stat (synfilename.c_str(), &stats) == -1) {
			std::string error(g_strerror(errno));
			g_warning(syn_file_no_found_msg, synfilename.c_str(), error.c_str());
			result = VERIF_RESULT_CRITICAL;
			if(fix_errors) {
				dict_info.set_synwordcount(0);
				g_message(fixed_ignore_syn_file_msg);
			}
			return result;
		}
		synfilesize = stats.st_size;
	}
	g_message(loading_syn_file_msg, synfilename.c_str());

	synindex.clear();
	synindex.reserve(std::min(MAX_RESERVED_INDEX_SIZE, dict_info.get_synwordcount()));

	// One spare byte so that &buf[0] is valid even for an empty file.
	std::vector<gchar> buf(synfilesize+1);
	gchar *buffer_begin = &buf[0];
	gchar *buffer_end = buffer_begin+synfilesize;
	{
		FILE *synfile = g_fopen(synfilename.c_str(),"rb");
		if(!synfile) {
			std::string error(g_strerror(errno));
			g_warning(open_read_file_err, synfilename.c_str(), error.c_str());
			result = VERIF_RESULT_CRITICAL;
			if(fix_errors) {
				dict_info.set_synwordcount(0);
				g_message(fixed_ignore_syn_file_msg);
			}
			return result;
		}
		if(synfilesize != fread (buffer_begin, 1, synfilesize, synfile)) {
			std::string error(g_strerror(errno));
			g_warning(open_read_file_err, synfilename.c_str(), error.c_str());
			result = VERIF_RESULT_CRITICAL;
			fclose (synfile);
			if(fix_errors) {
				dict_info.set_synwordcount(0);
				g_message(fixed_ignore_syn_file_msg);
			}
			return result;
		}
		fclose (synfile);
	}

	const char *p=buffer_begin;
	int wordlen;
	gint cmpvalue;
	guint wordcount=0;
	synitem_t synitem, presynitem;
	size_t size_remain; // to the end of the synonyms file

	while (p < buffer_end) {
		size_remain = buffer_end - p;
		// The word must be NUL-terminated inside the file data.
		const char* const word_end = reinterpret_cast<const char*>(memchr(p, '\0', size_remain));
		if(!word_end) {
			g_warning(syn_file_truncated_err);
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors)
				g_message(fixed_ignore_file_tail_msg);
			break;
		}
		synitem.word = p;
		wordlen = synitem.word.length();
		if (!g_utf8_validate(synitem.word.c_str(), wordlen, NULL)) {
			g_warning(word_invalid_utf8_err, synitem.word.c_str());
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors) {
				synitem.word = fix_utf8_str(synitem.word);
				wordlen = synitem.word.length();
				g_message(fixed_utf8_drop_invalid_char_msg);
			}
		}
		{	// check for invalid chars
			typedef std::list<const char*> str_list_t;
			str_list_t invalid_chars;
			const char* const word = synitem.word.c_str();
			if(check_xml_string_chars(word, invalid_chars)) {
				result = combine_result(result, VERIF_RESULT_WARNING);
				g_message(word_invalid_char_value_err,
					word, print_char_codes(invalid_chars).c_str());
				if(fix_errors) {
					g_message(fixed_drop_invalid_char_msg);
					fix_xml_string_chars(word, synitem.word);
					wordlen = synitem.word.length();
				}
			}
		}
		if (wordlen > 0) {
			if (wordlen>=MAX_INDEX_KEY_SIZE) {
				g_warning(long_word_err, synitem.word.c_str(), MAX_INDEX_KEY_SIZE, wordlen);
				result = combine_result(result, VERIF_RESULT_CRITICAL);
				if(fix_errors) {
					wordlen = truncate_utf8_string(synitem.word.c_str(), wordlen, MAX_INDEX_KEY_SIZE-1);
					synitem.word.resize(wordlen);
					g_message(fixed_word_truncated_msg);
				}
			}
			bool have_spaces = false;
			if (g_ascii_isspace(synitem.word[0])) {
				g_message(word_begin_space_err, synitem.word.c_str());
				result = combine_result(result, VERIF_RESULT_NOTE);
				have_spaces = true;
			}
			if (g_ascii_isspace(synitem.word[wordlen-1])) {
				g_message(word_end_space_err, synitem.word.c_str());
				result = combine_result(result, VERIF_RESULT_NOTE);
				have_spaces = true;
			}
			if(have_spaces && fix_errors) {
				g_message(fixed_trim_spaces);
				const char* new_beg;
				size_t new_len;
				trim_spaces(synitem.word.c_str(), new_beg, new_len);
				if(new_len == 0)
					synitem.word.clear();
				else {
					std::string tmp(new_beg, new_len);
					synitem.word = tmp;
				}
				// Keep wordlen in sync with the trimmed word; otherwise a
				// word made only of spaces would slip past the empty-word
				// check below.
				wordlen = synitem.word.length();
			}
		}
		if (check_stardict_key_chars(synitem.word.c_str())) {
			g_message(word_forbidden_chars_err, synitem.word.c_str());
			result = combine_result(result, VERIF_RESULT_NOTE);
			if(fix_errors) {
				g_message(fixed_drop_invalid_char_msg);
				std::string tmp;
				fix_stardict_key_chars(synitem.word.c_str(), tmp);
				synitem.word = tmp;
				wordlen = synitem.word.length();
			}
		}
		if (wordlen==0) {
			g_warning(empty_word_err);
			result = combine_result(result, VERIF_RESULT_WARNING);
			if(fix_errors)
				g_message(fixed_ignore_word_msg);
		}
		// Words must be in non-decreasing stardict_strcmp() order.
		if (!presynitem.word.empty() && !synitem.word.empty()) {
			cmpvalue=stardict_strcmp(presynitem.word.c_str(), synitem.word.c_str());
			if (cmpvalue>0) {
				g_warning(wrong_word_order_err, presynitem.word.c_str(), synitem.word.c_str());
				result = combine_result(result, VERIF_RESULT_WARNING);
				if(fix_errors)
					g_message(fixed_words_reordered_msg);
			}
		}
		p = word_end +1;
		size_remain = buffer_end - p;
		if(size_remain < sizeof(guint32)) {
			g_warning(syn_file_truncated_err);
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors)
				g_message(fixed_ignore_file_tail_msg);
			break;
		}
		{
			// The index follows a variable-length string, so p has no
			// particular alignment: copy the bytes out instead of
			// dereferencing p as a guint32.
			guint32 raw_index;
			memcpy(&raw_index, p, sizeof(raw_index));
			synitem.index = g_ntohl(raw_index);
		}
		if (synitem.index>=dict_info.get_wordcount()) {
			g_warning(wrong_index_err, synitem.word.c_str(), synitem.index);
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors) {
				// An item with an empty word is treated as ignored.
				synitem.word.clear();
				g_message(fixed_ignore_word_msg);
			}
		}
		p+=sizeof(guint32);
		presynitem = synitem;
		wordcount++;
		synindex.push_back(synitem);
	} // while

	g_assert(p <= buffer_end);

	if (wordcount != dict_info.get_synwordcount()) {
		g_warning(incorrect_syn_word_cnt_err,
			dict_info.get_synwordcount(), wordcount);
		result = combine_result(result, VERIF_RESULT_CRITICAL);
		if(fix_errors) {
			dict_info.set_synwordcount(wordcount);
			g_message(fixed_msg);
		}
	}

	// Report adjacent items that share both word and index.
	for(size_t i=0; i < synindex.size(); ++i) {
		// Ignored (empty) items are not real entries; skip them, as
		// load_idx_file does, so they are not reported as duplicates.
		if(synindex[i].word.empty())
			continue;
		for(size_t j=i+1; j < synindex.size() && synindex[i].word == synindex[j].word; ++j) {
			if(synindex[i].index == synindex[j].index) {
				g_warning(duplicate_syn_item_err,
					synindex[i].word.c_str(), synindex[i].index);
				result = combine_result(result, VERIF_RESULT_NOTE);
				break;
			}
		}
	}

	// Loading fails at CRITICAL severity, or only at FATAL when fixing errors.
	if((fix_errors ? VERIF_RESULT_FATAL : VERIF_RESULT_CRITICAL) <= result) {
		g_warning(load_syn_file_failed_err, synfilename.c_str());
		if(fix_errors) {
			dict_info.set_synwordcount(0);
			synindex.clear();
			g_message(fixed_ignore_syn_file_msg);
			result = VERIF_RESULT_CRITICAL;
		}
	}
	return result;
}
コード例 #8
0
/*
 * Load and verify the index (.idx) file named by `idxfilename`.
 *
 * After prepare_idx_file() succeeds, the whole file is read into memory and
 * parsed as a sequence of records, each a NUL-terminated word followed by
 * two 32-bit values in network byte order (data offset, then data size).
 * Every record is validated (UTF-8, XML-invalid characters, key length,
 * leading/trailing spaces, forbidden key characters, emptiness, ordering
 * against the previous word, zero data size) and appended to `index`. When
 * `fix_errors` is set, repairable problems are corrected in the in-memory
 * copy and a "fixed" message is logged.
 *
 * Returns the combined (worst) verification result; VERIF_RESULT_FATAL is
 * combined in when the file cannot be stat'ed, opened or read.
 */
VerifResult binary_dict_parser_t::load_idx_file(void)
{
	VerifResult result = VERIF_RESULT_OK;
	{
		VerifResult res = prepare_idx_file();
		result = combine_result(result, res);
		// Abort at CRITICAL severity, or only at FATAL when fixing errors.
		if((fix_errors ? VERIF_RESULT_FATAL : VERIF_RESULT_CRITICAL) <= res)
			return result;
	}

	guint32 idxfilesize;
	{
		stardict_stat_t stats;
		if (g_stat (idxfilename.c_str(), &stats) == -1) {
			std::string error(g_strerror(errno));
			g_critical(file_not_found_idx_err, idxfilename.c_str(), error.c_str());
			return combine_result(result, VERIF_RESULT_FATAL);
		}
		idxfilesize = (guint32)stats.st_size;
	}
	g_message(loading_idx_file_msg, idxfilename_orig.c_str());

	if (dict_info.get_index_file_size() != idxfilesize) {
		g_warning(incorrect_idx_file_size_err,
			dict_info.get_index_file_size(), idxfilesize);
		result = combine_result(result, VERIF_RESULT_CRITICAL);
		if(fix_errors) {
			dict_info.set_index_file_size(idxfilesize);
			g_message(fixed_msg);
		} else
			return result;
	}

	index.clear();
	index.reserve(std::min(MAX_RESERVED_INDEX_SIZE, dict_info.get_wordcount()));

	// One spare byte so that &buf[0] is valid even for an empty file.
	std::vector<gchar> buf(idxfilesize+1);
	gchar * const buffer_beg = &buf[0];
	gchar * const buffer_end = buffer_beg+idxfilesize;
	{
		FILE *idxfile = g_fopen(idxfilename.c_str(),"rb");
		if(!idxfile) {
			std::string error(g_strerror(errno));
			g_critical(open_read_file_err, idxfilename.c_str(), error.c_str());
			return combine_result(result, VERIF_RESULT_FATAL);
		}
		if(idxfilesize != fread(buffer_beg, 1, idxfilesize, idxfile)) {
			std::string error(g_strerror(errno));
			g_critical(open_read_file_err, idxfilename.c_str(), error.c_str());
			fclose(idxfile);
			return combine_result(result, VERIF_RESULT_FATAL);
		}
		fclose(idxfile);
	}

	const char *p=buffer_beg;
	int wordlen;
	gint cmpvalue;
	guint wordcount=0;
	worditem_t worditem, preworditem;
	size_t size_remain; // to the end of the index file

	while (p < buffer_end) {
		size_remain = buffer_end - p;
		// The word must be NUL-terminated inside the file data.
		const char* const word_end = reinterpret_cast<const char*>(memchr(p, '\0', size_remain));
		if(!word_end) {
			g_warning(index_file_truncated_err);
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors)
				g_message(fixed_ignore_file_tail_msg);
			break;
		}
		worditem.word = p;
		wordlen = worditem.word.length();
		if (!g_utf8_validate(worditem.word.c_str(), wordlen, NULL)) {
			g_warning(word_invalid_utf8_err, worditem.word.c_str());
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors) {
				worditem.word = fix_utf8_str(worditem.word, 0);
				wordlen = worditem.word.length();
				g_message(fixed_utf8_drop_invalid_char_msg);
			}
		}
		{	// check for invalid chars
			typedef std::list<const char*> str_list_t;
			str_list_t invalid_chars;
			const char* const word = worditem.word.c_str();
			if(check_xml_string_chars(word, invalid_chars)) {
				result = combine_result(result, VERIF_RESULT_WARNING);
				g_message(word_invalid_char_value_err,
					word, print_char_codes(invalid_chars).c_str());
				if(fix_errors) {
					g_message(fixed_drop_invalid_char_msg);
					fix_xml_string_chars(word, worditem.word);
					wordlen = worditem.word.length();
				}
			}
		}
		if (wordlen > 0) {
			if (wordlen>=MAX_INDEX_KEY_SIZE) {
				g_warning(long_word_err, worditem.word.c_str(), MAX_INDEX_KEY_SIZE, wordlen);
				result = combine_result(result, VERIF_RESULT_CRITICAL);
				if(fix_errors) {
					wordlen = truncate_utf8_string(worditem.word.c_str(), wordlen, MAX_INDEX_KEY_SIZE-1);
					worditem.word.resize(wordlen);
					g_message(fixed_word_truncated_msg);
				}
			}
			bool have_spaces = false;
			if (g_ascii_isspace(worditem.word[0])) {
				g_message(word_begin_space_err, worditem.word.c_str());
				result = combine_result(result, VERIF_RESULT_NOTE);
				have_spaces = true;
			}
			if (g_ascii_isspace(worditem.word[wordlen-1])) {
				g_message(word_end_space_err, worditem.word.c_str());
				result = combine_result(result, VERIF_RESULT_NOTE);
				have_spaces = true;
			}
			if(have_spaces && fix_errors) {
				g_message(fixed_trim_spaces);
				const char* new_beg;
				size_t new_len;
				trim_spaces(worditem.word.c_str(), new_beg, new_len);
				if(new_len == 0)
					worditem.word.clear();
				else {
					std::string tmp(new_beg, new_len);
					worditem.word = tmp;
				}
				// Keep wordlen in sync with the trimmed word; otherwise a
				// word made only of spaces would slip past the empty-word
				// check below.
				wordlen = worditem.word.length();
			}
		}
		if(check_stardict_key_chars(worditem.word.c_str())) {
			g_message(word_forbidden_chars_err, worditem.word.c_str());
			result = combine_result(result, VERIF_RESULT_NOTE);
			if(fix_errors) {
				g_message(fixed_drop_invalid_char_msg);
				std::string tmp;
				fix_stardict_key_chars(worditem.word.c_str(), tmp);
				worditem.word = tmp;
				wordlen = worditem.word.length();
			}
		}
		if (wordlen==0) {
			g_warning(empty_word_err);
			result = combine_result(result, VERIF_RESULT_WARNING);
			if(fix_errors)
				g_message(fixed_ignore_word_msg);
		}
		// Words must be in non-decreasing stardict_strcmp() order.
		if (!preworditem.word.empty() && !worditem.word.empty()) {
			cmpvalue=stardict_strcmp(preworditem.word.c_str(), worditem.word.c_str());
			if (cmpvalue>0) {
				g_warning(wrong_word_order_err, preworditem.word.c_str(), worditem.word.c_str());
				result = combine_result(result, VERIF_RESULT_WARNING);
				if(fix_errors)
					g_message(fixed_words_reordered_msg);
			}
		}
		p = word_end + 1;
		size_remain = buffer_end - p;
		if(size_remain < 2 * sizeof(guint32)) {
			g_warning(index_file_truncated_err);
			result = combine_result(result, VERIF_RESULT_CRITICAL);
			if(fix_errors)
				g_message(fixed_ignore_file_tail_msg);
			break;
		}
		{
			// The two 32-bit fields follow a variable-length string, so p
			// has no particular alignment: copy the bytes out instead of
			// dereferencing p as a guint32.
			guint32 raw_value;
			memcpy(&raw_value, p, sizeof(raw_value));
			worditem.offset = g_ntohl(raw_value);
			p += sizeof(guint32);
			memcpy(&raw_value, p, sizeof(raw_value));
			worditem.size = g_ntohl(raw_value);
			p += sizeof(guint32);
		}
		if (worditem.size==0) {
			g_warning(empty_block_err, worditem.word.c_str());
			result = combine_result(result, VERIF_RESULT_WARNING);
			if(fix_errors) {
				// An item with an empty word is treated as ignored.
				worditem.word.clear();
				g_message(fixed_ignore_word_msg);
			}
		}
		preworditem = worditem;
		wordcount++;
		index.push_back(worditem);
	} // while

	g_assert(p <= buffer_end);

	if (dict_info.get_wordcount() != wordcount) {
		g_warning(incorrect_word_cnt_err, dict_info.get_wordcount(), wordcount);
		result = combine_result(result, VERIF_RESULT_CRITICAL);
		if(fix_errors) {
			dict_info.set_wordcount(wordcount);
			g_message(fixed_msg);
		}
	}

	// Report adjacent items that share word, offset and size; ignored
	// (empty) items are skipped.
	for(size_t i=0; i < index.size(); ++i) {
		if(index[i].word.empty())
			continue;
		for(size_t j=i+1; j < index.size() && index[i].word == index[j].word; ++j) {
			if(index[i].offset == index[j].offset && index[i].size == index[j].size) {
				g_warning(duplicate_index_item_err,
					index[i].word.c_str(), index[i].offset, index[i].size);
				result = combine_result(result, VERIF_RESULT_NOTE);
				break;
			}
		}
	}

	return result;
}
コード例 #9
0
void convert_babylonfile(const char *filename, print_info_t print_info, bool strip_html)
{			
	struct stat stats;
	if (g_stat (filename, &stats) == -1)
	{
		print_info("File not exist!\n");
		return;
	}
	gchar *basefilename = g_path_get_basename(filename);
	gchar *ch = strrchr(basefilename, '.');
	if (ch)
		*ch = '\0';
	gchar *dirname = g_path_get_dirname(filename);
	FILE *tabfile;
	tabfile = g_fopen(filename,"r");

	gchar *buffer = (gchar *)g_malloc (stats.st_size + 1);
	size_t readsize = fread (buffer, 1, stats.st_size, tabfile);
	fclose (tabfile);
	buffer[readsize] = '\0';	
	
	GArray *array = g_array_sized_new(FALSE,FALSE, sizeof(struct _worditem),20000);
	GArray *array2 = g_array_sized_new(FALSE,FALSE, sizeof(struct _synworditem),20000);
		
	gchar *p, *p1, *p2, *p3, *p4, *p5;
	p = buffer;
	if ((guchar)*p==0xEF && (guchar)*(p+1)==0xBB && (guchar)*(p+2)==0xBF) // UTF-8 order characters.
		p+=3;
	struct _worditem worditem;
	struct _synworditem synworditem;
	gint linenum=1;
	int stripmethod;
	if (strip_html)
		stripmethod = 0;
	else
		stripmethod = 1;
	std::string sametypesequence = "m";
	std::string bookname;
	std::string author;
	std::string email;
	std::string website;
	std::string description;
	std::string date;
	bool print_sameword;
	if (*p == '\n') {
		print_sameword = false;
		p++;
		linenum++;
		while (1) {
			if (*p == '\n') {
				p++;
				linenum++;
				break;
			}
			p++;
			p1 = strchr(p, '\n');
			if (!p1) {
				return;
			}
			*p1 = '\0';
			p1++;
			linenum++;
			if (g_str_has_prefix(p, "stripmethod=")) {
				p += sizeof("stripmethod=") -1;
				if (strcmp(p, "striphtml")==0)
					stripmethod = 0;
				else if (strcmp(p, "stripnewline")==0)
					stripmethod = 1;
				else if (strcmp(p, "keep")==0)
					stripmethod = 2;
			} else if (g_str_has_prefix(p, "sametypesequence=")) {
				p += sizeof("sametypesequence=") -1;
				sametypesequence = p;
			} else if (g_str_has_prefix(p, "bookname=")) {
				p += sizeof("bookname=") -1;
				bookname = p;
			} else if (g_str_has_prefix(p, "author=")) {
				p += sizeof("author=") -1;
				author = p;
			} else if (g_str_has_prefix(p, "email=")) {
				p += sizeof("email=") -1;
				email = p;
			} else if (g_str_has_prefix(p, "website=")) {
				p += sizeof("website=") -1;
				website = p;
			} else if (g_str_has_prefix(p, "date=")) {
				p += sizeof("date=") -1;
				date = p;
			} else if (g_str_has_prefix(p, "description=")) {
				p += sizeof("description=") -1;
				description = p;
			}
			p = p1;
		}
	} else {
		print_sameword = true;
	}
	while (1) {
		if (*p == '\0') {
                        print_info("Over\n");
                        break;
                }
		p1 = strchr(p,'\n');
		if (!p1) {
			gchar *str = g_strdup_printf("Error, no end line 1: %d\n", linenum);
			print_info(str);
			g_free(str);
			return;
		}
		*p1 = '\0';
		p1++;
		linenum++;
		p2 = strchr(p1,'\n');
		if (!p2) {
			gchar *str = g_strdup_printf("Error, no end line 2: %d\n", linenum);
			print_info(str);
			g_free(str);
			return;
		}
		*p2 = '\0';
		p2++;
		linenum++;
		p3=p2;
		if (*p3 != '\n') {
			gchar *str = g_strdup_printf("Error, not null line %d", linenum);
			print_info(str);
			g_free(str);
			return;
		}
		*p3='\0';
		p3++;
		linenum++;
		
		if (stripmethod == 0) {
			html_strstrip(p1, linenum-2, print_info);
		} else if (stripmethod == 1) {
			newline_strstrip(p1, linenum-2, print_info);
		} else if (stripmethod == 2) {
		}
		g_strstrip(p1);
		if (!(*p1)) {
			gchar *str = g_strdup_printf("%s-%d, bad definition!!!\n", basefilename, linenum-1);
			print_info(str);
			g_free(str);
			p= p3;
                        continue;
		}	
		
		p4 = strchr(p, '|');
		if (p4) {
			*p4 = '\0';
			worditem.word = p;
                        g_strstrip(worditem.word);
                        if (!worditem.word[0]) {
				gchar *str = g_strdup_printf("%s-%d, bad word!!!\n", basefilename, linenum-2);
				print_info(str);
				g_free(str);
                                p=p3;
                                continue;
                        }
			worditem.definition = p1;
                        g_array_append_val(array, worditem);
			std::list <std::string> WordList;
			WordList.push_back(worditem.word);
			p4++;
			while (true) {
				p5 = strchr(p4, '|');
				if (p5) {
					*p5 = '\0';
					synworditem.synword = p4;
					g_strstrip(synworditem.synword);
                        		if (!synworditem.synword[0]) {
						gchar *str = g_strdup_printf("%s-%d, bad word!!!\n", basefilename, linenum-2);
						print_info(str);
						g_free(str);
                				p4 = p5+1;
		                                continue;
                		        }
					bool find = false;
					for (std::list<std::string>::const_iterator it=WordList.begin(); it!=WordList.end(); ++it) {
						if (*it == synworditem.synword) {
							find= true;
							break;
						}
					}
					if (find) {
						if (print_sameword) {
							gchar *str = g_strdup_printf("Same word: %s\n", synworditem.synword);
							print_info(str);
							g_free(str);
						}
						p4 = p5+1;
						continue;
					} else {
						WordList.push_back(synworditem.synword);
					}
					synworditem.origword = worditem.word;
					synworditem.definition = worditem.definition;
					g_array_append_val(array2, synworditem);
					p4 = p5+1;
				} else {
					synworditem.synword = p4;
					g_strstrip(synworditem.synword);
                                        if (!synworditem.synword[0]) {
						gchar *str = g_strdup_printf("%s-%d, bad word!!!\n", basefilename, linenum-2);
						print_info(str);
						g_free(str);
                                                break;
                                        }
					bool find = false;
					for (std::list<std::string>::const_iterator it=WordList.begin(); it!=WordList.end(); ++it) {
						if (*it == synworditem.synword) {
							find= true;
							break;
						}
					}
					if (find) {
						if (print_sameword) {
							gchar *str = g_strdup_printf("Same word: %s\n", synworditem.synword);
							print_info(str);
							g_free(str);
						}
						break;
					}
					synworditem.origword = worditem.word;
                                        synworditem.definition = worditem.definition;
                                        g_array_append_val(array2, synworditem);
					break;
				}
			}
		} else {
			worditem.word = p;
			g_strstrip(worditem.word);
			if (!worditem.word[0]) {
				gchar *str = g_strdup_printf("%s-%d, bad word!!!\n", basefilename, linenum-2);
				print_info(str);
				g_free(str);
				p=p3;
				continue;
			}
			worditem.definition = p1;
			g_array_append_val(array, worditem);
		}
		p= p3;
	}		
	g_array_sort(array,comparefunc);
	g_array_sort(array2,comparefunc2);

	gchar ifofilename[256];
	gchar idxfilename[256];
	gchar dicfilename[256];
	sprintf(ifofilename, "%s" G_DIR_SEPARATOR_S "%s.ifo", dirname, basefilename);
	sprintf(idxfilename, "%s" G_DIR_SEPARATOR_S "%s.idx", dirname, basefilename);
	sprintf(dicfilename, "%s" G_DIR_SEPARATOR_S "%s.dict", dirname, basefilename);
	FILE *ifofile = g_fopen(ifofilename,"wb");
	FILE *idxfile = g_fopen(idxfilename,"wb");
	FILE *dicfile = g_fopen(dicfilename,"wb");
	
	guint32 offset_old;
	guint32 tmpglong;
	struct _worditem *pworditem;
	gint definition_len;
	gulong i;
	for (i=0; i< array->len; i++) {
		offset_old = ftell(dicfile);
		pworditem = &g_array_index(array, struct _worditem, i);
		definition_len = strlen(pworditem->definition);
		fwrite(pworditem->definition, 1 ,definition_len,dicfile);
		fwrite(pworditem->word,sizeof(gchar),strlen(pworditem->word)+1,idxfile);
                tmpglong = g_htonl(offset_old);
                fwrite(&(tmpglong),sizeof(guint32),1,idxfile);
                tmpglong = g_htonl(definition_len);
                fwrite(&(tmpglong),sizeof(guint32),1,idxfile);

	}
	fclose(idxfile);
	fclose(dicfile);

	gchar *str = g_strdup_printf("%s wordcount: %d\n", basefilename, array->len);
	print_info(str);
	g_free(str);

	if (array2->len) {
		gchar synfilename[256];
	        sprintf(synfilename, "%s" G_DIR_SEPARATOR_S "%s.syn", dirname, basefilename);
		FILE *synfile = g_fopen(synfilename,"wb");
		struct _synworditem *psynworditem;
		gint iFrom, iTo, iThisIndex, cmpint;
		bool bFound;
		for (i=0; i< array2->len; i++) {
			psynworditem = &g_array_index(array2, struct _synworditem, i);
			fwrite(psynworditem->synword, 1, strlen(psynworditem->synword)+1, synfile);
			bFound=false;
			iFrom=0;
			iTo=array->len-1;
			while (iFrom<=iTo) {
				iThisIndex=(iFrom+iTo)/2;
				pworditem = &g_array_index(array, struct _worditem, iThisIndex);
				cmpint = stardict_strcmp(psynworditem->origword, pworditem->word);
				if (cmpint>0)
					iFrom=iThisIndex+1;
				else if (cmpint<0)
					iTo=iThisIndex-1;
				else {
					bFound=true;
					break;
				}
				
			}
			if (!bFound) {
				gchar *str = g_strdup_printf("Error, %s not find.\n", psynworditem->origword);
				print_info(str);
				g_free(str);
				return;
			}
			do {
				if (iThisIndex==0)
					break;
				pworditem = &g_array_index(array, struct _worditem, iThisIndex-1);
				if (strcmp(psynworditem->origword, pworditem->word)==0)
					iThisIndex--;
				else
					break;
			} while (true);
			bFound=false;
			do {
				pworditem = &g_array_index(array, struct _worditem, iThisIndex);
				if (strcmp(psynworditem->origword, pworditem->word)==0) {
					if (psynworditem->definition == pworditem->definition) {
						bFound=true;
						break;
					} else
						iThisIndex++;
				} else
					break;
			} while (true);
			if (!bFound) {
				gchar *str = g_strdup_printf("Error, %s definition not find.\n", psynworditem->origword);
				print_info(str);
				g_free(str);
                                return;
                        }
			tmpglong = g_htonl(iThisIndex);
	                fwrite(&(tmpglong),sizeof(guint32),1, synfile);
		}
		fclose(synfile);
		gchar *str = g_strdup_printf("%s synwordcount: %d\n", basefilename, array2->len);
		print_info(str);
		g_free(str);
	}