示例#1
0
/**
 * Build the parse set for `sent` and (re)allocate sent->lnkages, the array
 * in which the linkages will be processed.
 *
 * The array is sized to MIN(sent->num_linkages_found, opts->linkage_limit).
 * When no linkages were found, every linkage counter is zeroed and
 * sent->lnkages is left NULL.
 *
 * @param sent   Sentence being parsed; its linkage counters and its
 *               lnkages array are updated in place.
 * @param pex    Forwarded unchanged to build_parse_set().
 * @param mchxt  Forwarded unchanged to build_parse_set().
 * @param ctxt   Forwarded unchanged to build_parse_set().
 * @param opts   Supplies verbosity and linkage_limit.
 * @return The value returned by build_parse_set(), which is treated here
 *         as a "count overflowed" flag.
 */
static bool setup_linkages(Sentence sent, extractor_t* pex,
                          fast_matcher_t* mchxt,
                          count_context_t* ctxt,
                          Parse_Options opts)
{
	bool overflowed = build_parse_set(pex, sent, mchxt, ctxt, sent->null_count, opts);
	print_time(opts, "Built parse set");

	if (overflowed && (1 < opts->verbosity))
	{
		err_ctxt ec = { sent };
		err_msgc(&ec, lg_Warn, "Count overflow.\n"
			"Considering a random subset of %zu of an unknown and large number of linkages\n",
			opts->linkage_limit);
	}

	if (sent->num_linkages_found == 0)
	{
		/* We may have been called before (e.g. this might be a panic
		 * parse); release an array left over from that call instead of
		 * orphaning it with the NULL assignment below. */
		if (sent->lnkages) free_linkages(sent);

		sent->num_linkages_alloced = 0;
		sent->num_linkages_post_processed = 0;
		sent->num_valid_linkages = 0;
		sent->lnkages = NULL;
		return overflowed;
	}

	/* We may have been called before, e.g. this might be a panic parse,
	 * and the linkages array may still be there from last time.
	 * Release it while the counters still describe that old array, i.e.
	 * before num_linkages_alloced is overwritten below. */
	if (sent->lnkages)
	{
		/* XXX free_linkages() zeros sent->num_linkages_found, so put the
		 * freshly computed count back afterwards.
		 * NOTE(review): `int` mirrors the (int) cast used in the MIN()
		 * below; confirm it matches the declared type of the field. */
		int found = sent->num_linkages_found;

		free_linkages(sent);
		sent->num_linkages_found = found;
	}

	/* Never process more linkages than the caller asked for. */
	sent->num_linkages_alloced =
		MIN(sent->num_linkages_found, (int) opts->linkage_limit);

	/* Now actually malloc the array in which we will process linkages. */
	sent->lnkages = linkage_array_new(sent->num_linkages_alloced);

	return overflowed;
}
示例#2
0
/**
 * Build the parse set for `sent`, allocate sent->lnkages and record in
 * each slot's lifo.index which linkage is to be extracted into it.
 *
 * At most opts->linkage_limit slots are allocated. The indices are chosen
 * as follows:
 *  - count overflowed:          slot i gets the negative index -(i+1);
 *  - everything fits:           slot i gets index i;
 *  - more found than allocated: the range [0, found) is cut into
 *    `allocated` consecutive blocks and one index is drawn at random from
 *    each block.
 *
 * When no linkages were found, the linkage counters are zeroed,
 * sent->lnkages is left NULL and nothing else happens.
 *
 * @param sent   Sentence being parsed; its linkage counters, lnkages
 *               array and (possibly) rand_state are updated in place.
 * @param mchxt  Forwarded unchanged to build_parse_set().
 * @param ctxt   Forwarded unchanged to build_parse_set().
 * @param opts   Supplies verbosity, linkage_limit and repeatable_rand.
 */
static void select_linkages(Sentence sent, fast_matcher_t* mchxt,
                            count_context_t* ctxt,
                            Parse_Options opts)
{
	size_t in;
	size_t N_linkages_found, N_linkages_alloced;

	bool overflowed = build_parse_set(sent, mchxt, ctxt, sent->null_count, opts);
	print_time(opts, "Built parse set");

	if (overflowed && (1 < opts->verbosity))
	{
		err_ctxt ec;
		ec.sent = sent;
		err_msg(&ec, Warn, "Warning: Count overflow.\n"
		  "Considering a random subset of %zu of an unknown and large number of linkages\n",
			opts->linkage_limit);
	}
	N_linkages_found = sent->num_linkages_found;

	if (N_linkages_found == 0)
	{
		/* We may have been called before (e.g. this might be a panic
		 * parse); release an array left over from that call instead of
		 * orphaning it with the NULL assignment below. */
		if (sent->lnkages) free_linkages(sent);

		sent->num_linkages_alloced = 0;
		sent->num_linkages_post_processed = 0;
		sent->num_valid_linkages = 0;
		sent->lnkages = NULL;
		return;
	}

	if (N_linkages_found > opts->linkage_limit)
	{
		N_linkages_alloced = opts->linkage_limit;
		if (opts->verbosity > 1)
		{
			err_ctxt ec;
			ec.sent = sent;
			err_msg(&ec, Warn,
			    "Warning: Considering a random subset of %zu of %zu linkages\n",
			    N_linkages_alloced, N_linkages_found);
		}
	}
	else
	{
		N_linkages_alloced = N_linkages_found;
	}

	/* Now actually malloc the array in which we will process linkages. */
	/* We may have been called before, e.g this might be a panic parse,
	 * and the linkages array may still be there from last time.
	 * XXX free_linkages() zeros sent->num_linkages_found, which is why
	 * the count is written back from the local copy right after. */
	if (sent->lnkages) free_linkages(sent);
	sent->num_linkages_found = N_linkages_found;
	sent->lnkages = linkage_array_new(N_linkages_alloced);

	/* Generate an array of linkage indices to examine */
	if (overflowed)
	{
		/* The negative index means that a random subset of links
		 * will be picked later on, in extract_links(). */
		for (in=0; in < N_linkages_alloced; in++)
		{
			sent->lnkages[in].lifo.index = -(in+1);
		}
	}
	else if (N_linkages_found == N_linkages_alloced)
	{
		/* Everything fits: take the linkages in order. */
		for (in=0; in<N_linkages_alloced; in++)
			sent->lnkages[in].lifo.index = in;
	}
	else
	{
		/* There are more linkages found than we can handle */
		/* Pick a (quasi-)uniformly distributed random subset. */

		/* Width of one block. This branch is only reached when
		 * N_linkages_found > N_linkages_alloced, so frac > 1 and every
		 * block below spans at least one index (the modulus is never 0). */
		const double frac =
			(double) N_linkages_found / (double) N_linkages_alloced;

		if (opts->repeatable_rand)
			sent->rand_state = N_linkages_found + sent->length;

		for (in=0; in<N_linkages_alloced; in++)
		{
			/* Convert straight to size_t: going through int would be
			 * undefined for bounds that do not fit in an int. */
			size_t block_bottom = (size_t) (((double) in) * frac);
			size_t block_top = (size_t) (((double) (in+1)) * frac);

			/* One random index out of [block_bottom, block_top). */
			sent->lnkages[in].lifo.index = block_bottom +
				(rand_r(&sent->rand_state) % (block_top-block_bottom));
		}
	}

	sent->num_linkages_alloced = N_linkages_alloced;
	/* Later, we subtract the number of invalid linkages */
	sent->num_valid_linkages = N_linkages_alloced;
}
示例#3
0
文件: api.c 项目: mclumd/Alfred
/*
 * Build the parse set for `sent`, choose which linkages to examine,
 * analyze each of them, and store the sorted results in sent->link_info.
 *
 * Steps, in order:
 *   1. free_post_processing(sent) is called (presumably to release the
 *      results of an earlier call -- confirm against its definition).
 *   2. build_parse_set() is run; if it reports overflow, the found-count
 *      is replaced by opts->linkage_limit.
 *   3. At most opts->linkage_limit linkage indices are chosen.
 *   4. Optionally (sent->length >= opts->twopass_length) a first pass
 *      visits every chosen linkage with PP_FIRST_PASS.
 *   5. A second pass (PP_SECOND_PASS) fills the Linkage_info array.
 *   6. The array is sorted with opts->cost_model.compare_fn.
 *
 * On return sent->num_linkages_alloced, sent->num_linkages_post_processed,
 * sent->num_valid_linkages and sent->link_info are set; the link_info
 * array allocated here is handed over to `sent`.
 *
 * Both passes stop early once resources_exhausted(opts->resources) is true.
 * Diagnostics are printed to stdout when opts->verbosity > 1.
 */
void post_process_linkages(Sentence sent, Parse_Options opts) {

    int *indices;
    int in, block_bottom, block_top;
    int N_linkages_found, N_linkages_alloced;
    int N_linkages_post_processed, N_valid_linkages;
    int overflowed, only_canonical_allowed;
    double denom;
    Linkage_info *link_info;
    int canonical;
    
    free_post_processing(sent);   

    overflowed = build_parse_set(sent, sent->null_count, opts);
    print_time(opts, "Built parse set");

    if (overflowed) {
	/* We know that sent->num_linkages_found is bogus, possibly negative */
        sent->num_linkages_found = opts->linkage_limit;
	if (opts->verbosity > 1) 
	  fprintf(stdout,
		  "Warning: Count overflow.\n"
		  "Considering a random subset of %d of an unknown and large number of linkages\n",
		  opts->linkage_limit);
    }
    N_linkages_found = sent->num_linkages_found;
    
    /* Nothing to post-process: zero the counters and leave link_info NULL. */
    if (sent->num_linkages_found == 0) {
	sent->num_linkages_alloced = 0;
	sent->num_linkages_post_processed = 0;
	sent->num_valid_linkages = 0;
	sent->link_info = NULL;
	return;
    }
    
    /* Allocate room for min(found, linkage_limit) results. */
    if (N_linkages_found > opts->linkage_limit) 
      {
	N_linkages_alloced = opts->linkage_limit;
	if (opts->verbosity > 1) 
	  fprintf(stdout,
		  "Warning: Considering a random subset of %d of %d linkages\n",
		  N_linkages_alloced, N_linkages_found);
      } 
    else N_linkages_alloced = N_linkages_found;
    
    link_info=(Linkage_info *)xalloc(N_linkages_alloced * sizeof(Linkage_info));
    N_linkages_post_processed = N_valid_linkages = 0;

    /* generate an array of linkage indices to examine */
    indices = (int *) xalloc(N_linkages_alloced * sizeof(int));
    if (overflowed) {
	/* Negative indices -1, -2, ...; NOTE(review): presumably these tell
	   extract_links() to pick a linkage at random -- confirm there. */
	for (in=0; in<N_linkages_alloced; in++) {
	    indices[in] = -(in+1); 
	}
    }
    else {
	/* Cut [0, N_linkages_found) into N_linkages_alloced consecutive
	   blocks and draw one index from each block.  The seed depends only
	   on the found-count and the sentence length.  When found == alloced
	   every block is exactly one wide, so the linkages are simply taken
	   in order. */
	my_random_initialize(N_linkages_found + sent->length);
	for (in=0; in<N_linkages_alloced; in++) {
	    denom = (double) N_linkages_alloced;
	    block_bottom = (int) (((double)in*(double) N_linkages_found)/denom);
	    block_top = (int) (((double)(in+1)*(double)N_linkages_found)/denom);
	    indices[in] = block_bottom + (my_random() % (block_top-block_bottom));
	}
	my_random_finalize();
    }

    only_canonical_allowed = (!(overflowed || (N_linkages_found > 2*opts->linkage_limit)));
    /* When we're processing only a small subset of the linkages, don't worry
       about restricting the set we consider to be canonical ones.  In the extreme
       case where we are only generating 1 in a million linkages, it's very unlikely
       that we'll hit two symmetric variants of the same linkage anyway. */
    
    /* (optional) first pass: just visit the linkages */ 
    /* The purpose of these two passes is to make the post-processing more
       efficient.  Because (hopefully) by the time you do the real work
       in the 2nd pass you've pruned the relevant rule set in the first pass. */
    if (sent->length >= opts->twopass_length) {
	for (in=0; (in < N_linkages_alloced) && 
	           (!resources_exhausted(opts->resources)); in++) {
	    /* extract_links() must run before the checks and analysis below;
	       NOTE(review): they presumably read the linkage it extracted. */
	    extract_links(indices[in], sent->null_count, sent->parse_info);
	    if (set_has_fat_down(sent)) {
		if (only_canonical_allowed && !is_canonical_linkage(sent)) continue;
		analyze_fat_linkage(sent, opts, PP_FIRST_PASS);
	    } 
	    else {
		analyze_thin_linkage(sent, opts, PP_FIRST_PASS);
	    }
	}
    }
    
    /* second pass: actually perform post-processing */
    /* Results are packed densely: a skipped (non-canonical) linkage does not
       consume a link_info slot, so N_linkages_post_processed may end up
       smaller than N_linkages_alloced. */
    for (in=0; (in < N_linkages_alloced) && 
	       (!resources_exhausted(opts->resources)); in++) {
	extract_links(indices[in], sent->null_count, sent->parse_info);
	if (set_has_fat_down(sent)) {
	    canonical = is_canonical_linkage(sent);
	    if (only_canonical_allowed && !canonical) continue;
	    link_info[N_linkages_post_processed] = 
		analyze_fat_linkage(sent, opts, PP_SECOND_PASS);
	    link_info[N_linkages_post_processed].fat = TRUE;
	    link_info[N_linkages_post_processed].canonical = canonical;
	} 
	else {
	    link_info[N_linkages_post_processed] = 
		analyze_thin_linkage(sent, opts, PP_SECOND_PASS);
	    link_info[N_linkages_post_processed].fat = FALSE;
	    link_info[N_linkages_post_processed].canonical = TRUE;
	}
	/* A linkage counts as valid when it has no violations. */
	if (link_info[N_linkages_post_processed].N_violations==0)
	    N_valid_linkages++;
	/* Remember which linkage index this result belongs to. */
	link_info[N_linkages_post_processed].index = indices[in];
	N_linkages_post_processed++;
    }

    print_time(opts, "Postprocessed all linkages"); 
    /* Only the slots actually filled are sorted. */
    qsort((void *)link_info, N_linkages_post_processed, sizeof(Linkage_info),
	  (int (*)(const void *, const void *)) opts->cost_model.compare_fn);
    
    /* Sanity check, skipped when the passes may have been cut short.
       Note: this assert takes a message argument, so it is a project macro
       rather than the one-argument assert from <assert.h>. */
    if (!resources_exhausted(opts->resources)) {
	assert(! ((N_linkages_post_processed == 0) && 
		  (N_linkages_found > 0) && 
		  (N_linkages_found < opts->linkage_limit)),
	       "None of the linkages is canonical");
    }

    if (opts->verbosity > 1) {
	fprintf(stdout, "%d of %d linkages with no P.P. violations\n", 
		N_valid_linkages, N_linkages_post_processed);
    }
	
    print_time(opts, "Sorted all linkages");

    /* Publish the results; link_info now belongs to the sentence. */
    sent->num_linkages_alloced = N_linkages_alloced;
    sent->num_linkages_post_processed = N_linkages_post_processed;
    sent->num_valid_linkages = N_valid_linkages;
    sent->link_info = link_info;

    /* The index array was only needed while extracting. */
    xfree(indices, N_linkages_alloced * sizeof(int));
    /*if(N_valid_linkages == 0) free_andlists(sent); */
}