/* ftw(3) callback: loads a document, optionally filters non-English files and
 * normalizes unicode, then hands the document to the user-supplied
 * FUNCTION_FOR_DOCS callback. Returns that callback's value, or 0 when the
 * entry was skipped (not a regular file, failed load, or dismissed language). */
int read_doc_and_call_function(const char *filename, const struct stat *status, int type) {
  UNUSED(status);
  if (type != FTW_F) return 0;   //not a file

  xmlDoc *document = read_document(filename);
  if (document == NULL) return 0;   //bug fix: error message already printed by read_document

  //if desired, check language or normalize unicode
  if (TRAVERSAL_PARAMETERS & LOADER_CHECK_LANGUAGE) {
    dnmPtr dnm = create_DNM(xmlDocGetRootElement(document), DNM_SKIP_TAGS);
    if (dnm == NULL) {
      fprintf(stderr, "Couldn't create DNM of %s (fatal)\n", filename);
      exit(1);
    }
    if (!text_is_english(TEXT_CAT_HANDLE, dnm->plaintext, dnm->size_plaintext)) {  //if it isn't primarily english
      fprintf(TRAVERSAL_LOG_FILE, "Dismissing %s (appears not to be in English)\n", filename);
      free_DNM(dnm);   //bug fix: dnm was leaked on this early-return path
      xmlFreeDoc(document);
      return 0;
    }
    free_DNM(dnm);
  }
  if (TRAVERSAL_PARAMETERS & LOADER_NORMALIZE_UNICODE)
    unicode_normalize_dom(document);

  //call the function passed by the user
  int ret_val = FUNCTION_FOR_DOCS(document, filename);

  xmlFreeDoc(document);
  return ret_val;
}
/* ftw(3) callback: parses one file and prints its name to stdout when the
 * document's plaintext does not classify as English. Every visited file name
 * is echoed to stderr as progress output. Always returns 0 to keep walking. */
int parse(const char *filename, const struct stat *status, int type) {
  UNUSED(status);
  if (type != FTW_F) {
    return 0; //Not a file
  }

  fprintf(stderr, "%s\n", filename);

  xmlDoc *document = read_document(filename);
  if (document == NULL) {
    //error message printed by read_document
    return 0;
  }

  dnmPtr plain = create_DNM(xmlDocGetRootElement(document), DNM_SKIP_TAGS);
  if (plain == NULL) {
    fprintf(stderr, "Couldn't create DNM\n");
    exit(1);
  }

  //report documents that aren't primarily english
  if (!text_is_english(my_tc, plain->plaintext, plain->size_plaintext)) {
    printf("%s\n", filename);
  }

  free_DNM(plain);
  xmlFreeDoc(document);
  return 0;
}
Example #3
0
/* Test fixture setup: loads the fixture document, builds an item cache from a
 * scratch copy of the fixture data, and trains a tagger on it. */
static void setup(void) {
  setup_fixture_path();
  // NOTE(review): the return value of read_document is discarded — presumably
  // it populates the global 'document' used by build_tagger below; confirm.
  read_document("fixtures/complete_tag.atom");
  // Work on a disposable copy so tests can mutate the cache freely.
  system("rm -Rf /tmp/valid-copy && cp -R fixtures/valid /tmp/valid-copy && chmod -R 755 /tmp/valid-copy");
  item_cache_create(&item_cache, "/tmp/valid-copy", &item_cache_options);
  tagger = build_tagger(document, item_cache);
  train_tagger(tagger, item_cache);
  tagger->probability_function = &probability_function;
  // Training must have succeeded before any test runs.
  assert_equal(TAGGER_TRAINED, tagger->state);
  random_background = new_pool();
}
Example #4
0
// Build the (global) corpus_index from the whitespace-separated document
// paths listed in corpus_file, reading each listed document in turn.
void create_corpus_index(const char* corpus_file){
  // Create the index to be used for analysis
  std::cout << "Loading corpus using files listed in " << corpus_file << std::endl;
  std::fstream corpus (corpus_file, std::fstream::in);
  if (!corpus.is_open()) {
    // bug fix: an unopenable listing previously produced an empty index silently
    std::cerr << "Couldn't open corpus listing " << corpus_file << std::endl;
    return;
  }
  // Each whitespace-delimited token in the listing is treated as one document path.
  std::istream_iterator<std::string> corpus_iterator(corpus);
  std::for_each(corpus_iterator, std::istream_iterator<std::string>(), [](const std::string& doc){
      read_document(doc, corpus_index);
    });
  std::cout << "Loaded corpus of " << corpus_index.size() 
	    << " words from " << num_docs << " file(s)." 
	    << std::endl;
}
void run_slave(int myrank) {
	void *my_tc = llamapun_textcat_Init();
	char filename[FILE_NAME_SIZE];
	char message[RETURN_MESSAGE_SIZE];
	MPI_Status status;
	while (1) {
		MPI_Recv(&filename, FILE_NAME_SIZE, MPI_CHAR, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		if (status.MPI_TAG == 0) {
			printf("%2d - exiting\n", myrank);
			break;
		} else if (status.MPI_TAG == 1) {
			//do the actual job

			//printf("%2d - %s\n", myrank, filename);

			xmlDoc *doc = read_document(filename);
			if (doc == NULL) {
				snprintf(message, RETURN_MESSAGE_SIZE, "Couldn't load document %s\n", filename);
				MPI_Send(message, RETURN_MESSAGE_SIZE, MPI_CHAR, /*dest = */ 0, /*tag = message */ 1, MPI_COMM_WORLD);
			}
			
			dnmPtr dnm = create_DNM(xmlDocGetRootElement(doc), DNM_SKIP_TAGS);

			if (dnm == NULL) {
				fprintf(stderr, "%2d - Couldn't create DNM - exiting\n", myrank);
				exit(1);
			}
			char *result = textcat_Classify(my_tc, dnm->plaintext, dnm->size_plaintext);
			if (strncmp(result, "[english]", strlen("[english]"))) {  //isn't primarily english
				snprintf(message, RETURN_MESSAGE_SIZE, "%s\t%s\n", filename, result);
				MPI_Send(message, RETURN_MESSAGE_SIZE, MPI_CHAR, /*dest = */ 0, /*tag = message */ 1, MPI_COMM_WORLD);
			}
			else {
				snprintf(message, RETURN_MESSAGE_SIZE, "%s\tenglish\n", filename);
				printf("%2d - %s", myrank, message);
				MPI_Send(message, RETURN_MESSAGE_SIZE, MPI_CHAR, /*dest = */ 0, /*tag = nothing special */ 0, MPI_COMM_WORLD);
			}

			//clean up
			free_DNM(dnm);
			xmlFreeDoc(doc);
		} else {
			fprintf(stderr, "%2d - Error: Unkown tag: %d - exiting\n", myrank, status.MPI_TAG);
			break;
		}
	}
	//clean up
	textcat_Done(my_tc);
	xmlCleanupParser();
}
Example #6
0
/* Test fixture setup: loads the fixture document, builds an item cache from a
 * scratch copy of the fixture data, trains and precomputes a tagger, then
 * fetches the item under test. */
static void setup(void) {
  setup_fixture_path();
  // NOTE(review): the return value of read_document is discarded — presumably
  // it populates the global 'document' used by build_tagger below; confirm.
  read_document("fixtures/complete_tag.atom");
  random_background = new_pool();
  // Work on a disposable copy so tests can mutate the cache freely.
  system("rm -Rf /tmp/valid-copy && cp -R fixtures/valid /tmp/valid-copy && chmod -R 755 /tmp/valid-copy");
  item_cache_create(&item_cache, "/tmp/valid-copy", &item_cache_options);

  tagger = build_tagger(document, item_cache);
  train_tagger(tagger, item_cache);
  tagger->probability_function = &naive_bayes_probability;
  tagger->classification_function = &mock_classify;
  precompute_tagger(tagger, random_background);
  // Precomputation must have succeeded before any test runs.
  assert_equal(TAGGER_PRECOMPUTED, tagger->state);

  classified_item = NULL;
  // freeit is an out-parameter of item_cache_fetch_item; its value is unused here.
  int freeit;
  item = item_cache_fetch_item(item_cache, (unsigned char*) "urn:peerworks.org:entry#709254", &freeit);
}
/* Entry point: prints the normalized, stemmed words of every paragraph of the
 * document given as the single command-line argument. Falls back to the
 * relaxed paragraph XPath when the strict one matches nothing. */
int main(int argc, char const *args[]) {
	if (argc != 2) {
		printf("Please provide a file name as an argument\n");
		exit(1);
	}

	xmlDocPtr document = read_document(args[1]);
	if (document == NULL) {
		//bug fix: a failed load was previously passed on as NULL
		fprintf(stderr, "Couldn't load document %s\n", args[1]);
		exit(1);
	}
	init_document_loader();

	int b = with_words_at_xpath(print_words_of_paragraph, document, paragraph_xpath, /* logfile = */ stderr,
					WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS | WORDS_MARK_END_OF_SENTENCE,
					DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES);
	if (!b) {
		//strict xpath found no paragraphs; retry with the relaxed variant
		printf("USE RELAXED PARAGRAPH\n");
		b = with_words_at_xpath(print_words_of_paragraph, document, relaxed_paragraph_xpath, /* logfile = */ stderr,
					WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS | WORDS_MARK_END_OF_SENTENCE,
					DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES);
	}

	xmlFreeDoc(document);
	close_document_loader();
	return 0;
}
Example #8
0
/* Trains a k-mixture bigram model: parses options, loads the phrase table and
 * phrase index, then for every remaining argv file reads the document into
 * per-document/unigram hashes and folds the counts into the bigram model. */
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    setlocale(LC_ALL, "");
    /* Hand-rolled option parsing; stops at the first non-option argument,
     * which begins the list of training documents. */
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            g_train_pi_gram = false;
        } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_occurs = atoi(argv[i]);
        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_increase_rates = atof(argv[i]);
        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            break;
        }
        ++i;
    }

    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    // NOTE(review): "phrase_index.bin" is loaded into the phrase *table* and
    // the load result is unchecked — verify file name and add error handling.
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    // NOTE(review): k_mixture_model_filename stays NULL when the
    // --k-mixture-model-file option is omitted — confirm attach handles that.
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    /* Remaining arguments are training documents, processed one at a time. */
    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        /* token1 -> (token2 -> count) for this document, plus unigram counts. */
        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        /* Record one more trained document in the model header. */
        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        /* steal each nested hash out of the table before unref'ing it, so the
         * nested tables are released exactly once. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
//SLAVE process: receives file names from the master (tag 1), runs the
//word-level pre-processing/processing passes over each document, and then
//requests the next file. An incoming tag 0 terminates the loop.
void run_slave(int myrank) {
	char filename[FILE_NAME_SIZE];
	snprintf(filename, FILE_NAME_SIZE, "/tmp/llamapun_inp_%d.txt", myrank);
	result_file = fopen(filename, "w");
	if (result_file == NULL) {
		fprintf(stderr, "%2d - Couldn't create %s (fatal)\n", myrank, filename);
		exit(1);
	}

	/* Load pre-computed corpus IDF scores */
	json_object* idf_json = read_json("idf.json");
	idf = json_to_score_hash(idf_json);
	json_object_put(idf_json);
	/* Map common words to bin positions */
	word_to_bin = frequencies_hash_to_bins(idf);

	MPI_Status status;
	init_document_loader();

	while (1) {
		MPI_Recv(&filename, FILE_NAME_SIZE, MPI_CHAR, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		if (status.MPI_TAG == 0) {
			printf("%2d - exiting\n", myrank);
			break;
		} else if (status.MPI_TAG == 1) {
			printf("%2d - %s\n", myrank, filename);
			//do the actual job
			xmlDoc *document = read_document(filename);
			if (document == NULL) {
				//bug fix: previously fell through and passed NULL to
				//unicode_normalize_dom; skip processing but still request work below
				fprintf(stderr, "%2d - Couldn't load document %s\n", myrank, filename);
			} else {
				unicode_normalize_dom(document);
				prepare_preprocessing();
				//try the strict paragraph xpath first, fall back to the relaxed one
				int b = with_words_at_xpath(pre_process_paragraph, document, paragraph_xpath, /* logfile = */ stderr,
					WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS, /* | WORDS_MARK_END_OF_SENTENCE, */
					DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES);
				if (!b) {
					with_words_at_xpath(pre_process_paragraph, document, relaxed_paragraph_xpath, /* logfile = */ stderr,
						WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS,
						DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES);
					in_between_processing();
					with_words_at_xpath(process_paragraph, document, relaxed_paragraph_xpath, /* logfile = */ stderr,
						WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS,
						DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES);
				} else {
					in_between_processing();
					with_words_at_xpath(process_paragraph, document, paragraph_xpath, /* logfile = */ stderr,
						WORDS_NORMALIZE_WORDS | WORDS_STEM_WORDS, /* | WORDS_MARK_END_OF_SENTENCE, */
						DNM_NORMALIZE_TAGS | DNM_IGNORE_LATEX_NOTES);
				}
				clean_after_processing();

				xmlFreeDoc(document);
			}

			//request new document (also after a failed load, so the master keeps this slave busy)
			int i = 0;
			MPI_Send(&i, 1, MPI_INT, /*dest = */ 0, /*tag = nothing special */ 0, MPI_COMM_WORLD);
		} else {
			fprintf(stderr, "%2d - Error: Unknown tag: %d - exiting\n", myrank, status.MPI_TAG);
			break;
		}
	}
	//clean up
	close_document_loader();
	fclose(result_file);
	free_score_hash(idf);
}
Example #10
0
/* Trains a k-mixture bigram model (GOption variant): parses options, loads
 * the system table info, phrase table and phrase index, then for every
 * remaining argv file reads the document into per-document/unigram hashes and
 * folds the counts into the bigram model. */
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    /* Option parsing via GLib; 'entries' is defined at file scope. */
    context = g_option_context_new("- generate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    // NOTE(review): g_k_mixture_model_filename is an option-backed global and
    // may be NULL when the option is omitted — confirm attach handles that.
    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    /* Remaining arguments are training documents, processed one at a time.
     * (g_option_context_parse has removed recognized options from argv.) */
    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        /* token1 -> (token2 -> count) for this document, plus unigram counts. */
        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        /* Record one more trained document in the model header. */
        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        /* steal each nested hash out of the table before unref'ing it, so the
         * nested tables are released exactly once. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
Example #11
0
/* ftw(3) callback: loads a document, selects its paragraphs via XPath
 * (strict, then relaxed), and for every paragraph tokenizes sentences and
 * content words, stemming each word to a fixed point. The stems are computed
 * and freed; the frequency accounting itself is marked as a TODO below. */
int process_file(const char *filename, const struct stat *status, int type) {
  if (type != FTW_F) return 0; //Not a file
  file_counter++;
  //if (file_counter > 100) { return 0; } // Limit for development
  fprintf(stderr, " Loading %s\n",filename);
  xmlDoc *doc = read_document(filename);
  if (doc == NULL) return 0;   //error message printed by read_document
  /* 1. Normalize Unicode */
  unicode_normalize_dom(doc);
  /* 2. Select paragraphs */
  xmlXPathContextPtr xpath_context = xmlXPathNewContext(doc);
  if(xpath_context == NULL) {
    fprintf(stderr,"Error: unable to create new XPath context for %s\n",filename);
    xmlFreeDoc(doc);
    exit(1); }
  xmlXPathObjectPtr paragraphs_result = xmlXPathEvalExpression(paragraph_xpath,xpath_context);
  // Nothing to do if the strict paragraph XPath produced no hits
  if ((paragraphs_result == NULL) || (paragraphs_result->nodesetval == NULL) || (paragraphs_result->nodesetval->nodeNr == 0)) { // Nothing to do if there's no math in the document
    // Clean up this try, before making a second one:
    // (nodeTab and nodesetval are freed manually; the nodes themselves belong to doc)
    if (paragraphs_result != NULL) {
      if (paragraphs_result->nodesetval != NULL) {
        free(paragraphs_result->nodesetval->nodeTab);
        free(paragraphs_result->nodesetval); }
      xmlFree(paragraphs_result); }

    // Try the relaxed version: document isn't using LaTeX's \section{}s, maybe it's TeX.
    paragraphs_result = xmlXPathEvalExpression(relaxed_paragraph_xpath,xpath_context);
    if ((paragraphs_result == NULL) || (paragraphs_result->nodesetval == NULL) || (paragraphs_result->nodesetval->nodeNr == 0)) {
      // We're really giving up here, probably empty document
      if (paragraphs_result != NULL) {
        if (paragraphs_result->nodesetval != NULL) {
          free(paragraphs_result->nodesetval->nodeTab);
          free(paragraphs_result->nodesetval); }
        xmlFree(paragraphs_result); }
      xmlXPathFreeContext(xpath_context);
      xmlFreeDoc(doc);
      return 0; } }
  xmlNodeSetPtr paragraph_nodeset = paragraphs_result->nodesetval;
  int para_index;

  /* Iterate over each paragraph: */
  for (para_index=0; para_index < paragraph_nodeset->nodeNr; para_index++) {
    xmlNodePtr paragraph_node = paragraph_nodeset->nodeTab[para_index];
    // Obtain NLP-friendly plain-text of the paragraph:
    // -- We want to skip tags, as we only are interested in word counts for terms in TF-IDF
    dnmPtr paragraph_dnm = create_DNM(paragraph_node, DNM_SKIP_TAGS);
    if (paragraph_dnm == NULL) {
      fprintf(stderr, "Couldn't create DNM for paragraph %d in document %s\n",para_index, filename);
      exit(1);
    }
    /* 3. For every paragraph, tokenize sentences: */
    char* paragraph_text = paragraph_dnm->plaintext;
    dnmRanges sentences = tokenize_sentences(paragraph_text);
    /* 4. For every sentence, tokenize words */
    int sentence_index = 0;
    for (sentence_index = 0; sentence_index < sentences.length; sentence_index++) {
      // Obtaining only the content words here, disregard stopwords and punctuation
      dnmRanges words = tokenize_words(paragraph_text, sentences.range[sentence_index],
                                       TOKENIZER_ALPHA_ONLY | TOKENIZER_FILTER_STOPWORDS);
      int word_index;
      for(word_index=0; word_index<words.length; word_index++) {
        char* word_string = plain_range_to_string(paragraph_text, words.range[word_index]);
        char* word_stem;
        morpha_stem(word_string, &word_stem);
        /* Ensure stemming is an invariant (tilings -> tiling -> tile -> tile) */
        // Re-stem until the stem stops changing; each morpha_stem call
        // allocates a fresh string, so the previous one is freed each round.
        while (strcmp(word_string, word_stem) != 0) {
          free(word_string);
          word_string = word_stem;
          morpha_stem(word_string, &word_stem);
        }
        free(word_string);
        // Note: SENNA's tokenization has some features to keep in mind:
        //  multi-symplectic --> "multi-" and "symplectic"
        //  Birkhoff's       --> "birkhoff" and "'s"
        // Add to the document frequency
        // NOTE(review): word_stem is freed without being recorded anywhere —
        // the document-frequency update above appears unimplemented.
        free(word_stem);
      }
      free(words.range);
    }
    free(sentences.range);
    free_DNM(paragraph_dnm);
  }
  // Final cleanup mirrors the manual nodeset teardown used above.
  free(paragraphs_result->nodesetval->nodeTab);
  free(paragraphs_result->nodesetval);
  xmlFree(paragraphs_result);
  xmlXPathFreeContext(xpath_context);
  xmlFreeDoc(doc);

  fprintf(stderr,"Completed document #%d\n",file_counter);
  return 0;
}
Example #12
0
// Build the (global) search_index from the words of search_file.
void create_search_index(const char* search_file){
  // Create the index of the search words
  read_document(search_file, search_index);
  // NOTE(review): decrements the global document counter — presumably so the
  // search file itself is not counted as a corpus document; confirm.
  num_docs--;
}