int read_doc_and_call_function(const char *filename, const struct stat *status, int type) { UNUSED(status); if (type != FTW_F) return 0; //not a file xmlDoc *document = read_document(filename); //if desired, check language or normalize unicode if (TRAVERSAL_PARAMETERS & LOADER_CHECK_LANGUAGE) { dnmPtr dnm = create_DNM(xmlDocGetRootElement(document), DNM_SKIP_TAGS); if (dnm == NULL) { fprintf(stderr, "Couldn't create DNM of %s (fatal)\n", filename); exit(1); } if (!text_is_english(TEXT_CAT_HANDLE, dnm->plaintext, dnm->size_plaintext)) { //if it isn't primarily english fprintf(TRAVERSAL_LOG_FILE, "Dismissing %s (appears not to be in English)\n", filename); xmlFreeDoc(document); return 0; } free_DNM(dnm); } if (TRAVERSAL_PARAMETERS & LOADER_NORMALIZE_UNICODE) unicode_normalize_dom(document); //call the function passed by the user int ret_val = FUNCTION_FOR_DOCS(document, filename); xmlFreeDoc(document); return ret_val; }
int parse(const char *filename, const struct stat *status, int type) { if (type != FTW_F) return 0; //Not a file UNUSED(status); fprintf(stderr, "%s\n", filename); xmlDoc *doc = read_document(filename); if (doc == NULL) return 0; //error message printed by read_document dnmPtr dnm = create_DNM(xmlDocGetRootElement(doc), DNM_SKIP_TAGS); if (dnm == NULL) { fprintf(stderr, "Couldn't create DNM\n"); exit(1); } if (!text_is_english(my_tc, dnm->plaintext, dnm->size_plaintext)) { //isn't primarily english printf("%s\n", filename); } free_DNM(dnm); xmlFreeDoc(doc); return 0; }
/* Test fixture setup: builds and trains a tagger from the complete_tag.atom
 * fixture against a fresh working copy of the "valid" item cache, then checks
 * it reached the TAGGER_TRAINED state.  Side effects: populates the
 * file-scope `document`, `item_cache`, `tagger` and `random_background`. */
static void setup(void) {
  setup_fixture_path();
  read_document("fixtures/complete_tag.atom");
  /* Work on a throw-away copy so tests can mutate the cache freely. */
  system("rm -Rf /tmp/valid-copy && cp -R fixtures/valid /tmp/valid-copy && chmod -R 755 /tmp/valid-copy");
  item_cache_create(&item_cache, "/tmp/valid-copy", &item_cache_options);
  tagger = build_tagger(document, item_cache);
  train_tagger(tagger, item_cache);
  /* Inject the test's probability function (presumably a stub/mock — confirm). */
  tagger->probability_function = &probability_function;
  assert_equal(TAGGER_TRAINED, tagger->state);
  random_background = new_pool();
}
void create_corpus_index(const char* corpus_file){ // Create the index to be used for analysis std::cout << "Loading corpus using files listed in " << corpus_file << std::endl; std::fstream corpus (corpus_file, std::fstream::in); std::istream_iterator<std::string> corpus_iterator(corpus); std::for_each(corpus_iterator, std::istream_iterator<std::string>(), [](const std::string& doc){ read_document(doc, corpus_index); }); std::cout << "Loaded corpus of " << corpus_index.size() << " words from " << num_docs << " file(s)." << std::endl; }
void run_slave(int myrank) { void *my_tc = llamapun_textcat_Init(); char filename[FILE_NAME_SIZE]; char message[RETURN_MESSAGE_SIZE]; MPI_Status status; while (1) { MPI_Recv(&filename, FILE_NAME_SIZE, MPI_CHAR, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status); if (status.MPI_TAG == 0) { printf("%2d - exiting\n", myrank); break; } else if (status.MPI_TAG == 1) { //do the actual job //printf("%2d - %s\n", myrank, filename); xmlDoc *doc = read_document(filename); if (doc == NULL) { snprintf(message, RETURN_MESSAGE_SIZE, "Couldn't load document %s\n", filename); MPI_Send(message, RETURN_MESSAGE_SIZE, MPI_CHAR, /*dest = */ 0, /*tag = message */ 1, MPI_COMM_WORLD); } dnmPtr dnm = create_DNM(xmlDocGetRootElement(doc), DNM_SKIP_TAGS); if (dnm == NULL) { fprintf(stderr, "%2d - Couldn't create DNM - exiting\n", myrank); exit(1); } char *result = textcat_Classify(my_tc, dnm->plaintext, dnm->size_plaintext); if (strncmp(result, "[english]", strlen("[english]"))) { //isn't primarily english snprintf(message, RETURN_MESSAGE_SIZE, "%s\t%s\n", filename, result); MPI_Send(message, RETURN_MESSAGE_SIZE, MPI_CHAR, /*dest = */ 0, /*tag = message */ 1, MPI_COMM_WORLD); } else { snprintf(message, RETURN_MESSAGE_SIZE, "%s\tenglish\n", filename); printf("%2d - %s", myrank, message); MPI_Send(message, RETURN_MESSAGE_SIZE, MPI_CHAR, /*dest = */ 0, /*tag = nothing special */ 0, MPI_COMM_WORLD); } //clean up free_DNM(dnm); xmlFreeDoc(doc); } else { fprintf(stderr, "%2d - Error: Unkown tag: %d - exiting\n", myrank, status.MPI_TAG); break; } } //clean up textcat_Done(my_tc); xmlCleanupParser(); }
/* Test fixture setup: trains AND precomputes a tagger from the
 * complete_tag.atom fixture over a fresh copy of the "valid" item cache,
 * then fetches one known item for the classification tests.  Side effects:
 * populates the file-scope `document`, `item_cache`, `tagger`,
 * `random_background`, `classified_item` (reset to NULL) and `item`. */
static void setup(void) {
  setup_fixture_path();
  read_document("fixtures/complete_tag.atom");
  random_background = new_pool();
  /* Work on a throw-away copy so tests can mutate the cache freely. */
  system("rm -Rf /tmp/valid-copy && cp -R fixtures/valid /tmp/valid-copy && chmod -R 755 /tmp/valid-copy");
  item_cache_create(&item_cache, "/tmp/valid-copy", &item_cache_options);
  tagger = build_tagger(document, item_cache);
  train_tagger(tagger, item_cache);
  /* Real probability function, mocked classification function. */
  tagger->probability_function = &naive_bayes_probability;
  tagger->classification_function = &mock_classify;
  precompute_tagger(tagger, random_background);
  assert_equal(TAGGER_PRECOMPUTED, tagger->state);
  classified_item = NULL;
  int freeit;
  item = item_cache_fetch_item(item_cache, (unsigned char*) "urn:peerworks.org:entry#709254", &freeit);
}
int main(int argc, char const *args[]) { if (argc != 2) { printf("Please provide a file name as an argument\n"); exit(1); } xmlDocPtr document = read_document(args[1]); init_document_loader(); int b = with_words_at_xpath(print_words_of_paragraph, document, paragraph_xpath, /* logfile = */ stderr, WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS | WORDS_MARK_END_OF_SENTENCE, DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES); if (!b) { printf("USE RELAXED PARAGRAPH\n"); b = with_words_at_xpath(print_words_of_paragraph, document, relaxed_paragraph_xpath, /* logfile = */ stderr, WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS | WORDS_MARK_END_OF_SENTENCE, DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES); } xmlFreeDoc(document); close_document_loader(); return 0; }
/* Entry point: trains a k-mixture bigram model from plain-text documents.
 * Usage: [options] file...  Options are parsed until the first non-option
 * argument; all remaining arguments are treated as training documents.
 * Each document is segmented via the phrase table, counted into per-document
 * hash tables, folded into the bigram model, and the model's document count
 * (m_N) is incremented. */
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    setlocale(LC_ALL, "");
    /* Option parsing: stops at the first argument that is not a recognized
     * flag, leaving `i` pointing at the first document file. */
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            g_train_pi_gram = false;
        } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_occurs = atoi(argv[i]);
        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_increase_rates = atof(argv[i]);
        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            break;
        }
        ++i;
    }

    /* Load the phrase lookup table from its serialized binary form. */
    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    /* NOTE(review): k_mixture_model_filename stays NULL if the option was not
     * given — presumably attach() handles that; confirm. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    /* Train on each remaining command-line argument. */
    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        /* hash_of_document: token -> hash of following words (bigram counts);
         * hash_of_unigram: per-document unigram counts. */
        HashofDocument hash_of_document = g_hash_table_new (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* Train the document, folding its counts into the k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram, hash_of_document, token1);
        }

        /* One more document trained: bump the model's document counter. */
        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* Free resources of hash_of_document: steal each value so the outer
         * table does not destroy it, then drop our own reference. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
//SLAVE process void run_slave(int myrank) { char filename[FILE_NAME_SIZE]; snprintf(filename, FILE_NAME_SIZE, "/tmp/llamapun_inp_%d.txt", myrank); result_file = fopen(filename, "w"); if (result_file == NULL) { fprintf(stderr, "%2d - Couldn't create %s (fatal)\n", myrank, filename); exit(1); } /* Load pre-computed corpus IDF scores */ json_object* idf_json = read_json("idf.json"); idf = json_to_score_hash(idf_json); json_object_put(idf_json); /* Map common words to bin positions */ word_to_bin = frequencies_hash_to_bins(idf); MPI_Status status; init_document_loader(); while (1) { MPI_Recv(&filename, FILE_NAME_SIZE, MPI_CHAR, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status); if (status.MPI_TAG == 0) { printf("%2d - exiting\n", myrank); break; } else if (status.MPI_TAG == 1) { printf("%2d - %s\n", myrank, filename); //do the actual job xmlDoc *document = read_document(filename); if (document == NULL) { fprintf(stderr, "%2d - Couldn't load document %s\n", myrank, filename); } unicode_normalize_dom(document); prepare_preprocessing(); int b = with_words_at_xpath(pre_process_paragraph, document, paragraph_xpath, /* logfile = */ stderr, WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS, /* | WORDS_MARK_END_OF_SENTENCE, */ DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES); if (!b) { with_words_at_xpath(pre_process_paragraph, document, relaxed_paragraph_xpath, /* logfile = */ stderr, WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS, DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES); in_between_processing(); with_words_at_xpath(process_paragraph, document, relaxed_paragraph_xpath, /* logfile = */ stderr, WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS, DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES); } else { in_between_processing(); with_words_at_xpath(process_paragraph, document, paragraph_xpath, /* logfile = */ stderr, WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS, /* | WORDS_MARK_END_OF_SENTENCE, */ DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES); } clean_after_processing(); xmlFreeDoc(document); //request new document int 
i = 0; MPI_Send(&i, 1, MPI_INT, /*dest = */ 0, /*tag = nothing special */ 0, MPI_COMM_WORLD); } else { fprintf(stderr, "%2d - Error: Unkown tag: %d - exiting\n", myrank, status.MPI_TAG); break; } } //clean up close_document_loader(); fclose(result_file); free_score_hash(idf); }
/* Entry point: trains a k-mixture bigram model from plain-text documents.
 * Options are handled by GOption (see `entries`); the remaining command-line
 * arguments are the training documents.  Each document is segmented via the
 * system phrase table, counted into per-document hash tables, folded into the
 * bigram model, and the model's document count (m_N) is incremented. */
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- generate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* Attach the read-only system phrase table and load the phrase index. */
    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    /* Train on each remaining (non-option) command-line argument. */
    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        /* hash_of_document: token -> hash of following words (bigram counts);
         * hash_of_unigram: per-document unigram counts. */
        HashofDocument hash_of_document = g_hash_table_new (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* Train the document, folding its counts into the k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram, hash_of_document, token1);
        }

        /* One more document trained: bump the model's document counter. */
        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* Free resources of hash_of_document: steal each value so the outer
         * table does not destroy it, then drop our own reference. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
/* ftw(3) callback: tokenizes one document into paragraphs, sentences and
 * stemmed content words (the word counts presumably feed a TF-IDF model —
 * the accumulation step is still a stub below).  Falls back to the relaxed
 * paragraph XPath when the strict one matches nothing; always returns 0 so
 * the tree walk continues. */
int process_file(const char *filename, const struct stat *status, int type) {
  if (type != FTW_F) return 0; //Not a file
  file_counter++;
  //if (file_counter > 100) { return 0; } // Limit for development
  fprintf(stderr, " Loading %s\n",filename);
  xmlDoc *doc = read_document(filename);
  if (doc == NULL) return 0; //error message printed by read_document

  /* 1. Normalize Unicode */
  unicode_normalize_dom(doc);

  /* 2. Select paragraphs */
  xmlXPathContextPtr xpath_context = xmlXPathNewContext(doc);
  if(xpath_context == NULL) {
    fprintf(stderr,"Error: unable to create new XPath context for %s\n",filename);
    xmlFreeDoc(doc);
    exit(1);
  }
  xmlXPathObjectPtr paragraphs_result = xmlXPathEvalExpression(paragraph_xpath,xpath_context);
  if ((paragraphs_result == NULL) || (paragraphs_result->nodesetval == NULL)
      || (paragraphs_result->nodesetval->nodeNr == 0)) {
    // Nothing to do if there's no math in the document
    // Clean up this try, before making a second one:
    // (manual teardown mirrors xmlXPathFreeObject without freeing the nodes themselves)
    if (paragraphs_result != NULL) {
      if (paragraphs_result->nodesetval != NULL) {
        free(paragraphs_result->nodesetval->nodeTab);
        free(paragraphs_result->nodesetval);
      }
      xmlFree(paragraphs_result);
    }
    // Try the relaxed version: document isn't using LaTeX's \section{}s, maybe it's TeX.
    paragraphs_result = xmlXPathEvalExpression(relaxed_paragraph_xpath,xpath_context);
    if ((paragraphs_result == NULL) || (paragraphs_result->nodesetval == NULL)
        || (paragraphs_result->nodesetval->nodeNr == 0)) {
      // We're really giving up here, probably empty document
      if (paragraphs_result != NULL) {
        if (paragraphs_result->nodesetval != NULL) {
          free(paragraphs_result->nodesetval->nodeTab);
          free(paragraphs_result->nodesetval);
        }
        xmlFree(paragraphs_result);
      }
      xmlXPathFreeContext(xpath_context);
      xmlFreeDoc(doc);
      return 0;
    }
  }
  xmlNodeSetPtr paragraph_nodeset = paragraphs_result->nodesetval;
  int para_index;
  /* Iterate over each paragraph: */
  for (para_index=0; para_index < paragraph_nodeset->nodeNr; para_index++) {
    xmlNodePtr paragraph_node = paragraph_nodeset->nodeTab[para_index];
    // Obtain NLP-friendly plain-text of the paragraph:
    // -- We want to skip tags, as we only are interested in word counts for terms in TF-IDF
    dnmPtr paragraph_dnm = create_DNM(paragraph_node, DNM_SKIP_TAGS);
    if (paragraph_dnm == NULL) {
      fprintf(stderr, "Couldn't create DNM for paragraph %d in document %s\n",para_index, filename);
      exit(1);
    }
    /* 3. For every paragraph, tokenize sentences: */
    char* paragraph_text = paragraph_dnm->plaintext;
    dnmRanges sentences = tokenize_sentences(paragraph_text);
    /* 4. For every sentence, tokenize words */
    int sentence_index = 0;
    for (sentence_index = 0; sentence_index < sentences.length; sentence_index++) {
      // Obtaining only the content words here, disregard stopwords and punctuation
      dnmRanges words = tokenize_words(paragraph_text, sentences.range[sentence_index],
        TOKENIZER_ALPHA_ONLY | TOKENIZER_FILTER_STOPWORDS);
      int word_index;
      for(word_index=0; word_index<words.length; word_index++) {
        char* word_string = plain_range_to_string(paragraph_text, words.range[word_index]);
        char* word_stem;
        morpha_stem(word_string, &word_stem);
        /* Ensure stemming is an invariant (tilings -> tiling -> tile -> tile):
         * re-stem until the stem maps to itself. */
        while (strcmp(word_string, word_stem) != 0) {
          free(word_string);
          word_string = word_stem;
          morpha_stem(word_string, &word_stem);
        }
        /* NOTE(review): both word_string and word_stem are freed below —
         * assumes morpha_stem always allocates a fresh output string, even
         * when input equals output; confirm against morpha_stem's contract. */
        free(word_string);
        // Note: SENNA's tokenization has some features to keep in mind:
        //   multi-symplectic --> "multi-" and "symplectic"
        //   Birkhoff's --> "birkhoff" and "'s"
        // Add to the document frequency
        free(word_stem);
      }
      free(words.range);
    }
    free(sentences.range);
    free_DNM(paragraph_dnm);
  }
  /* Final teardown of the XPath result, context and document. */
  free(paragraphs_result->nodesetval->nodeTab);
  free(paragraphs_result->nodesetval);
  xmlFree(paragraphs_result);
  xmlXPathFreeContext(xpath_context);
  xmlFreeDoc(doc);
  fprintf(stderr,"Completed document #%d\n",file_counter);
  return 0;
}
void create_search_index(const char* search_file){ // Create the index of the search words read_document(search_file, search_index); num_docs--; }