예제 #1
0
/**
 * Fill in the table ctxt->word_used[i][w]: a cell is set to 1 when
 * word w is an endpoint (left or right) of some link of sublinkage i,
 * and is left at 0 otherwise.
 *
 * For a unionized linkage with more than one sublinkage, the last
 * sublinkage is not processed.
 *
 * Side effect: linkage->current is left pointing at the last
 * sublinkage that was processed.
 */
static void count_words_used(con_context_t *ctxt, Linkage linkage)
{
	int subl, word, lnk;
	int subl_count = linkage->num_sublinkages;

	if ((linkage->unionized == 1) && (subl_count > 1))
		subl_count--;

	if (verbosity >= 2)
		printf("Number of sublinkages = %d\n", subl_count);

	for (subl = 0; subl < subl_count; subl++)
	{
		/* Start with a cleared row for this sublinkage. */
		word = 0;
		while (word < linkage->num_words)
		{
			ctxt->word_used[subl][word] = 0;
			word++;
		}

		/* Select the sublinkage before querying its links. */
		linkage->current = subl;
		for (lnk = 0; lnk < linkage_get_num_links(linkage); lnk++)
		{
			ctxt->word_used[subl][linkage_get_link_lword(linkage, lnk)] = 1;
			ctxt->word_used[subl][linkage_get_link_rword(linkage, lnk)] = 1;
		}

		if (verbosity < 2) continue;

		/* Dump the row that was just computed. */
		printf("Sublinkage %d: ", subl);
		for (word = 0; word < linkage->num_words; word++)
		{
			switch (ctxt->word_used[subl][word])
			{
				case 0: printf("0 "); break;
				case 1: printf("1 "); break;
			}
		}
		printf("\n");
	}
}
예제 #2
0
/**
 * Classify every word of the linkage into ctxt->wordtype[]:
 *
 *   STYPE  - the word is the right end of an S, SX or SF link;
 *   PTYPE  - such a word that is also the left end of a Pg#b, I, PP
 *            or Pv link;
 *   QTYPE  - the word is the right end of a QI#d link (a question
 *            word used in an indirect question);
 *   QDTYPE - a QI#d word that is also the left end of a D##w link
 *            (question-word determiner), or the right end of an Mr
 *            or MX#d link;
 *   NONE   - everything else.
 *
 * The checks are applied per link in the order above, so a later
 * match overwrites an earlier one for the same word.
 * (This function is called once for each sublinkage.)
 */
static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage)
{
	int lnk, other, rw, lw, idx;
	const char * lab, * other_lab;
	int is_subject_link;

	/* Every word starts out unclassified. */
	idx = 0;
	while (idx < linkage->num_words)
	{
		ctxt->wordtype[idx] = NONE;
		idx++;
	}

	for (lnk = 0; lnk < linkage_get_num_links(linkage); lnk++)
	{
		rw = linkage_get_link_rword(linkage, lnk);
		lab = linkage_get_link_label(linkage, lnk);

		/* Subject links: the right word is a finite verb. */
		is_subject_link = (uppercompare(lab, "S")==0) ||
		                  (uppercompare(lab, "SX")==0) ||
		                  (uppercompare(lab, "SF")==0);
		if (is_subject_link)
		{
			ctxt->wordtype[rw] = STYPE;

			/* Promote to PTYPE if the same word starts one of the
			 * listed links.  (Pvf, Pgf?) */
			for (other = 0; other < linkage_get_num_links(linkage); other++)
			{
				lw = linkage_get_link_lword(linkage, other);
				other_lab = linkage_get_link_label(linkage, other);
				if (lw != rw) continue;

				if ((post_process_match("Pg#b", other_lab)==1) ||
				    (uppercompare(other_lab, "I")==0) ||
				    (uppercompare(other_lab, "PP")==0) ||
				    (post_process_match("Pv", other_lab)==1))
				{
					ctxt->wordtype[rw] = PTYPE;
				}
			}
		}

		/* Indirect-question links. */
		if (post_process_match("QI#d", lab)==1)
		{
			ctxt->wordtype[rw] = QTYPE;

			/* Promote to QDTYPE if the same word starts a D##w link. */
			for (other = 0; other < linkage_get_num_links(linkage); other++)
			{
				lw = linkage_get_link_lword(linkage, other);
				other_lab = linkage_get_link_label(linkage, other);
				if ((lw == rw) && (post_process_match("D##w", other_lab)==1))
				{
					ctxt->wordtype[rw] = QDTYPE;
				}
			}
		}

		/* Mr and MX#d links mark their right word as QDTYPE outright. */
		if (post_process_match("Mr", lab)==1)
		{
			ctxt->wordtype[rw] = QDTYPE;
		}
		if (post_process_match("MX#d", lab)==1)
		{
			ctxt->wordtype[rw] = QDTYPE;
		}
	}
}
예제 #3
0
파일: jni-client.c 프로젝트: dyne/AutOrg
/*
 * Class:      LinkGrammar
 * Method:     getLinkRWord
 * Signature: (I)I
 *
 * Returns the result of linkage_get_link_rword() for link i of the
 * linkage held in the per-thread data obtained from get_ptd().
 */
JNIEXPORT jint JNICALL
Java_org_linkgrammar_LinkGrammar_getLinkRWord(JNIEnv *env, jclass cls, jint i)
{
	return linkage_get_link_rword(get_ptd(env, cls)->linkage, i);
}
예제 #4
0
파일: nlp.cpp 프로젝트: PistiZ/amminadab
 SPOTriplets NLP::sentence2triplets ( const char* sentence )
 {
   // vector of triplets
   SPOTriplets triplets;

   #ifdef DEBUG
     std::cout << "The sentence: " << sentence << std::endl;
   #endif
   // creates a Sentence from the input char*
   Sentence sent = sentence_create ( sentence, dict_ );
   #ifdef DEBUG
     std::cout << "Sentence created" << std::endl;
   #endif
   // tokenizes the sentence
   sentence_split ( sent, parse_opts_ );
   #ifdef DEBUG
     std::cout << "Sentence splitted" << std::endl;
   #endif
   // searches for all possible linkages
   int num_linkages = sentence_parse ( sent, parse_opts_ );
   #ifdef DEBUG
     std::cout << "Sentence parsed" << std::endl;
     std::cout << "Number of linkages: " << num_linkages << std::endl;
   #endif

   // just one triplet
   SPOTriplet triplet;

   // if there is any linkage in the sentence
   if( num_linkages > 0 )
   {
     // create the linkage
     Linkage linkage = linkage_create ( 0, sent, parse_opts_ );

     #ifdef DEBUG
       // prints the sentence's diagram
       std::cout << "The diagram: " << std::endl;
       char *diagram = linkage_print_diagram(linkage, true, 800);
       std::cout << diagram << std::endl;
       linkage_free_diagram( diagram );
       // end print diagram
     #endif

     std::vector<std::string> labels;

     // 1. find the S_link
     // S* except there is an SJ* because then S* except Spx
     // two cases: there is SJ* and there is not SJ*

     // TODO: VJlp VJrp same as SJ but to predications
     // TODO: SFut SFst what the f**k?                                     ###FIXED###
     // TODO: His form was shining like the light not working              ###FIXED###
     // TODO: Car is mine not working                                      ###FIXED###
     // TODO: The little brown bear has eaten all of the honey not working ###FIXED###

     // REGEXES
     std::regex SJ_( "SJ.*" );
     std::regex VJ_( "VJ.*");
     std::regex subject( "(Ss.*)|(SFut)|(Sp\*.*)" );
     std::regex Spx( "Spx.*" );
     // TODO:fix theese initializer list not allowed                       ###FIXED###
     std::regex predicate( "(Pv.*)|(Pg.*)|(PP.*)|(I.*)|(TO)|(MVi.*)" );
     // TODO: make one from theese // (Sp.*)|(Ss.*)                        ###FIXED###
     std::regex noun_adject_object ( "(O.*)|(Os.*)|(Op.*)|(MVpn.*)|(Pa.*)|(MVa.*)" );
     std::regex preposition ( "(MVp.*)|(Pp.*)|(OF)|(TO)" );
     std::regex prep_object ( "(J.*)|(TI)|(I.*)|(ON)" );
     // TODO: problems with matching!! Pg*!!                               ###FIXED###
     // TODO: problems with matching!! Mvp.*!!                             ###FIXED###

     bool s_found = false;
     bool p_found = false;
     bool o_found = false;
     bool SJ = false;

     // search for SJ.s labels
     for( auto label: labels )
     {
       if( std::regex_match( label, SJ_ ) )
       {
         SJ = true;
         break;
       }
     }

     // multiple subject in the sentence
     if( SJ )
     {
       // SPls left -> first subject
       // SPrs right -> second subject
       // Spx right -> predicate
       // SJ-s are multiple subjects
       std::string temp;
       // go through every linkage
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get their label
         std::string l = linkage_get_link_label( linkage, i );
         // if there is an SJl* label
         if( std::regex_match( l, std::regex( "SJl.*" ) ) )
         {
           // SJls left side
           triplet.s = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           triplet.cut( triplet.s );
           temp = triplet.s + " ";
           // and word
           triplet.s = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           triplet.cut( triplet.s );
           temp += triplet.s + " ";

           // find SJr*
           for( int j = 0; j < linkage_get_num_links( linkage ); ++j )
           {
             std::string m = linkage_get_link_label( linkage, j );
             if( std::regex_match( m, std::regex( "SJr.*" ) ) )
             {
               triplet.s = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) );
               triplet.cut();
               temp += triplet.s;
               triplet.s = temp;

               s_found = true;
               #ifdef DEBUG
                 std::cout << "Subject found: " << triplet.s << std::endl;
               #endif
               break;
             } // if
           } // for
           break;
         } // if
       } // for

       // now we have the subject

       // find Spx and its right side will be the starter predicate
       std::string current_word;
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         std::string l = linkage_get_link_label( linkage, i );
         if( std::regex_match( l, std::regex( "Spx.*" ) ) )
         {
           triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           current_word = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
         }
       }
       // from now all the same as on the else branch !!!!

       bool predicate_match = false;

       // search for the linkage that has triplet.s as left!
       do
       {
         predicate_match = false;

         for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
         {
           // every linkage's left word
           std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           // every linkage's label
           std::string l = linkage_get_link_label( linkage, i );

           if( std::regex_match( l, predicate ) && word_i == current_word )
           {
             // found predicate
             triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
             current_word = triplet.p;
             predicate_match = true;
             break;
           }
         }
       }
       while( predicate_match );

       // we now have the predicate too
       // TODO: multiple predicates!
       p_found = true;
       #ifdef DEBUG
         std::cout << "Predicate found: " << triplet.p << std::endl;
       #endif

       // ###COPY BEGIN###

       // search for noun object or adjective object
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get every linkage label
         std::string l = linkage_get_link_label( linkage, i );
         // get the left word of every linkage
         std::string l_word = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
         // if thete is a label that match AND its left word is the predicate
         if( std::regex_match( l, noun_adject_object ) && triplet.p == l_word )
         {
           // then the object is that linkage's right word
           triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           triplet.cut( triplet.o );
           o_found = true;
           #ifdef DEBUG
             std::cout << "Adjective or noun object found: " << triplet.o << std::endl;
           #endif
         } // if
       } // for

       // still not found object, then search for preposition
       if( !o_found )
       {
         // go through every linkage
         for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
         {
           // get the linkage's label
           std::string l = linkage_get_link_label( linkage, i );
           // and left word
           std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           // if there is a linkage which is a preposition and its left word is the predicate
           if( std::regex_match( l, preposition ) && triplet.p == word_i )
           {
             // found preposition
             // search for prep_object
             // then the temp will contain the preposition label's right word
             std::string temp = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
             #ifdef DEBUG
               std::cout << "Preposition found! and its rigth word is: " << temp << std::endl;
             #endif

             for( int j = 0; j < linkage_get_num_links( linkage ); ++j )
             {
               // every linkages
               std::string m = linkage_get_link_label( linkage, j );
               // every left word
               std::string word_j = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );

               // if there is a label with match and its left is exactly the preposition's right
               if( std::regex_match( m, prep_object ) && temp == word_j )
               {
                 triplet.o = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );
                 triplet.cut(triplet.o);

                 triplet.o += " ";
                 // save o
                 std::string temp = triplet.o;

                 triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) );
                 triplet.cut(triplet.o);
                 temp += triplet.o;

                 triplet.o = temp;
                 o_found = true;
                 #ifdef DEBUG
                   std::cout << "Object found: " << triplet.o << std::endl;
                 #endif
               } // if( std::regex_match( m, prep_object ) && temp == word_j ) END
             } // for J END
           } // if( std::regex_match( l, preposition ) && triplet.p == word_i ) END
         } // for I END
       } // if( !o_found ) END

       if( s_found && p_found && o_found )
       {
         // TODO: cut the words itself not the whole triplet
         // have to cut every word itself
         // triplet.cut();
         triplet.cut(triplet.s);
         triplet.cut(triplet.p);
         triplets.push_back( triplet );
         s_found = false;
         p_found = false;
         o_found = false;
       }
       // ###COPY END###
     }
     else // only one subject
     {
       // except Spx!!!
       // S left -> subject
       // S right -> predicate at first
       // if the word next to S right, is an element of Pv*, Pg* PP*, I*, TO, MVi*
       // then the new predicate will be that word

       std::string current_word;

       // search for subject (S_link)
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get the linkage's label
         std::string l = linkage_get_link_label( linkage, i );

         if( std::regex_match( l, subject ) )
         {
           // subject found
           triplet.s = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           s_found = true;
           current_word = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           triplet.p = current_word;
           #ifdef DEBUG
             std::cout << "Subject found: " << triplet.s << std::endl;
           #endif
           break;
         }
       }

       if( s_found )
       {
         bool predicate_match = false;

         // search for the linkage that has triplet.s as left!
         do
         {
           predicate_match = false;

           for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
           {
             // every linkage's left word
             std::string l_word = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
             // every linkage's label
             std::string l = linkage_get_link_label( linkage, i );

             if( std::regex_match( l, predicate ) && l_word == current_word )
             {
               // found predicate
               triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
               current_word = triplet.p;
               predicate_match = true;
               break;
             }
           } // for END
         } while( predicate_match );

         p_found = true;
         #ifdef DEBUG
           std::cout << "Predicate found: " << triplet.p << std::endl;
         #endif
       } // if( s_found ) END

       // subject and predicate found
       // search for object

       // from k to linkage_get_num_links( linkage )
       // if there is any of the noun, adjective od preposition object then that
       // label's right will give the object.

       // !!! search only between labels that has triplet.p as left word !!!!!

       // search for noun object or adjective objects
       // go through all links
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get every linkage label
         std::string l = linkage_get_link_label( linkage, i );
         // get the left word of every linkage
         std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
         // if thete is a label that match AND its left word is the predicate
         if( std::regex_match( l, noun_adject_object ) && triplet.p == word_i )
         {
           // then the object is that linkage's right word
           triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           o_found = true;
           triplet.cut(triplet.o);
           #ifdef DEBUG
             std::cout << "Adjective or noun object found: " << triplet.o << std::endl;
           #endif
         } // if END
       } // for END

       // still not found object, then search for preposition
       if( !o_found )
       {
         // go through every linkage
         for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
         {
           // get the linkage's label
           std::string l = linkage_get_link_label( linkage, i );
           // and left word
           std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );

           // if there is a linkage which is a preposition and its left word is the predicate
           if( std::regex_match( l, preposition ) && triplet.p == word_i )
           {
             // found preposition
             // search for prep_object
             // then the temp will contain the preposition label's right word
             std::string temp = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
             #ifdef DEBUG
               std::cout << "Preposition found! and its rigth word is: " << temp << std::endl;
             #endif

             // start search from there
             for( int j = 0; j < linkage_get_num_links( linkage ); ++j )
             {
               // every linkages
               std::string m = linkage_get_link_label( linkage, j );
               // every left word
               std::string word_j = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );
               #ifdef DEBUG
                 if( std::regex_match( m, prep_object ) )
                     std::cout << m << " DOES match to (J.*)|(TI)|(I.*)|(ON)" << std::endl;
               #endif

               // if there is a label with match and its left is exactly the preposition's right
               if( std::regex_match( m, prep_object ) && temp == word_j )
               {
                 triplet.o = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );
                 triplet.cut(triplet.o);

                 triplet.o += " ";
                 // save o
                 std::string temp = triplet.o;

                 triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) );
                 triplet.cut(triplet.o);
                 temp += triplet.o;

                 triplet.o = temp;
                 #ifdef DEBUG
                   std::cout << "Object found: " << triplet.o << std::endl;
                 #endif
                 o_found = true;
               }
             } // for
           } // if
         } // for
       } // if( o_found ) END

       if( s_found && p_found && o_found )
       {
         // TODO: cut the words itself not the whole triplet ###FIXED###
         // have to cut every word itself
         // triplet.cut();

         triplet.cut(triplet.s);
         triplet.cut(triplet.p);
         triplets.push_back( triplet );
         s_found = false;
         p_found = false;
         o_found = false;
       }

     } // end else

     linkage_delete ( linkage );
   } // if( num_linkages > 0 ) END
예제 #5
0
/**
 * Build the constituents of sublinkage s from its post-processing
 * domains and store them in ctxt->constituent[], starting at index
 * numcon_total.
 *
 * Stages, in order:
 *  1. For every domain, compute the leftmost/rightmost word it spans
 *     and add a constituent for it, plus extra constituents (S, SBAR,
 *     WHNP, WHPP, WHADVP) depending on the domain type and on the
 *     start link of the constituent.
 *  2. Run gen_comp() for a fixed list of complement cases.
 *  3. Call adjust_subordinate_clauses() and strip a leading comma
 *     from constituents whose domain type is 'p'.
 *  4. Repeatedly adjust constituent boundaries until no comma-related
 *     overlap adjustment is made.
 *  5. Set the subl and aux fields of every constituent.
 *
 * @param ctxt          constituent context; its constituent[] and
 *                      wordtype[] arrays are read and written here
 * @param linkage       the linkage being processed
 * @param numcon_total  index of the first constituent slot to use
 * @param s             index of the sublinkage whose domains are read
 * @return the constituent count for this sublinkage, as returned by
 *         the last gen_comp() call
 */
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
                                          int numcon_total, int s)
{
	int d, c, leftlimit, l, leftmost, rightmost, w, c2, numcon_subl=0, w2;
	List_o_links * dlink;
	int rootright, rootleft, adjustment_made;
	Sublinkage * subl;
	const char * name;
	Domain domain;

	/* r_limit is not declared in this function; it is only used in
	 * the commented-out loop header further down. */
	r_limit = linkage->num_words-2; /**PV**/

	subl = &linkage->sublinkage[s];

	for (d=0, c=numcon_total; d<subl->pp_data.N_domains; d++, c++) {
		domain = subl->pp_data.domain_array[d];
		/* NOTE(review): rootright is assigned but never read in this function. */
		rootright = linkage_get_link_rword(linkage, domain.start_link);
		rootleft =  linkage_get_link_lword(linkage, domain.start_link);

		/* For these domain types the span starts at the LEFT word of the
		 * start link and may extend leftwards without limit; for all
		 * others it starts at the RIGHT word and may not extend further
		 * left than the word after the start link's left word. */
		if ((domain.type=='c') ||
			(domain.type=='d') ||
			(domain.type=='e') ||
			(domain.type=='f') ||
			(domain.type=='g') ||
			(domain.type=='u') ||
			(domain.type=='y')) {
			leftlimit = 0;
			leftmost = linkage_get_link_lword(linkage, domain.start_link);
			rightmost = linkage_get_link_lword(linkage, domain.start_link);
		}
		else {
			leftlimit = linkage_get_link_lword(linkage, domain.start_link)+1;
			leftmost = linkage_get_link_rword(linkage, domain.start_link);
			rightmost = linkage_get_link_rword(linkage, domain.start_link);
		}

		/* Start by assigning both left and right limits to the
		   right word of the start link. This will always be contained
		   in the constituent. This will also handle the case
		   where the domain contains no links. */

		/* Widen [leftmost, rightmost] over every link in the domain. */
		for (dlink = domain.lol; dlink!=NULL; dlink=dlink->next) {
			l=dlink->link;

			if ((linkage_get_link_lword(linkage, l) < leftmost) &&
				(linkage_get_link_lword(linkage, l) >= leftlimit))
				leftmost = linkage_get_link_lword(linkage, l);

			if (linkage_get_link_rword(linkage, l) > rightmost)
				rightmost = linkage_get_link_rword(linkage, l);
		}

		/* c is used below as the index of the constituent most recently
		 * added; presumably add_constituent() advances c before filling
		 * the slot and returns the new index, hence the step back here
		 * -- TODO confirm against add_constituent(). */
		c--;
		c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost,
						cons_of_domain(domain.type));

		if (domain.type=='z') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if (domain.type=='c') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		/* The checks below match against the start_link of the constituent
		 * at index c, which changes whenever an earlier check added one. */
		if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("Rn", ctxt->constituent[c].start_link)==1)) {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "SBAR");
		}
		if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("MX#r", ctxt->constituent[c].start_link)==1)) {
			w=leftmost;
			/* Skip a leading comma. */
			if (strcmp(linkage->word[w], ",")==0) w++;
			c = add_constituent(ctxt, c, linkage, domain, w, w, "WHNP");
		}
		if (post_process_match("Mj", ctxt->constituent[c].start_link)==1) {
			w=leftmost;
			/* Skip a leading comma. */
			if (strcmp(linkage->word[w], ",")==0) w++;
			c = add_constituent(ctxt, c, linkage, domain, w, w+1, "WHPP");
			c = add_constituent(ctxt, c, linkage, domain, w+1, w+1, "WHNP");
		}
		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("B#d", ctxt->constituent[c].start_link)==1)) {
			c = add_constituent(ctxt, c, linkage, domain, rootleft, rootleft, "WHNP");
			/* The SBAR runs from rootleft to the right edge of the
			 * constituent that preceded the WHNP just added. */
			c = add_constituent(ctxt, c, linkage, domain,
							rootleft, ctxt->constituent[c-1].right, "SBAR");
		}
		if (post_process_match("CP", ctxt->constituent[c].start_link)==1) {
			if (strcmp(linkage->word[leftmost], ",")==0)
				ctxt->constituent[c].left++;
			/* S spanning word 1 through the last word of the linkage. */
			c = add_constituent(ctxt, c, linkage, domain, 1, linkage->num_words-1, "S");
		}
		if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
			(domain.type=='f')) {
			w=ctxt->constituent[c].left;
			if (strcmp(linkage->word[w], ",")==0)
				w++;
			if (strcmp(linkage->word[w], "when")==0) {
				c = add_constituent(ctxt, c, linkage, domain, w, w, "WHADVP");
			}
		}
		if (domain.type=='t') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("QI", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("Mr", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("MX#d", ctxt->constituent[c].start_link)==1)) {
			w = leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			/* Pick the WH-constituent name from the word type recorded
			 * in ctxt->wordtype[]. */
			if (ctxt->wordtype[w] == NONE)
				name = "WHADVP";
			else if (ctxt->wordtype[w] == QTYPE)
				name = "WHNP";
			else if (ctxt->wordtype[w] == QDTYPE)
				name = "WHNP";
			else
				/* NOTE(review): name stays unset on this path; this is only
				 * safe if this assert() does not return -- confirm. */
				assert(0, "Unexpected word type");
			c = add_constituent(ctxt, c, linkage, domain, w, w, name);

			if (ctxt->wordtype[w] == QDTYPE) {
				/* Now find the finite verb to the right, start an S */
				/* Limit w2 to sentence length. */
				// for( w2=w+1; w2 < r_limit-1; w2++ )
				/* If no STYPE/PTYPE word is found, w2 ends up equal to rightmost. */
				for (w2 = w+1; w2 < rightmost; w2++)
				  if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break;

				/* Adjust the right boundary of previous constituent */
				ctxt->constituent[c].right = w2-1;
				c = add_constituent(ctxt, c, linkage, domain, w2, rightmost, "S");
			  }
		}

		/* Sanity checks on the last constituent added for this domain. */
		if (ctxt->constituent[c].domain_type=='\0') {
			error("Error: no domain type assigned to constituent\n");
		}
		if (ctxt->constituent[c].start_link==NULL) {
			error("Error: no type assigned to constituent\n");
		}
	}

	/* Number of constituents added by the loop above. */
	numcon_subl = c - numcon_total;
	/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl);  */

	if (verbosity >= 2)
		printf("Constituents added at first stage for subl %d:\n",
			   linkage->current);
	/* NOTE(review): print_constituent() is called regardless of verbosity;
	 * presumably it checks verbosity itself -- confirm. */
	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		print_constituent(ctxt, linkage, c);
	}

	/* Opener case - generates S around main clause.
	   (This must be done first; the S generated will be needed for
	   later cases.) */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", 5);

	/* pp opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", 6);

	/* participle opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", 9);

	/* Subject-phrase case; every main VP generates an S */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", 1);

	/* Relative clause case; an SBAR generates a complement NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", 3);

	/* Participle modifier case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", 8);

	/* PP modifying NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", 8);

	/* Appositive case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", 4);

	/* S-V inversion case; an NP generates a complement VP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", 7);

	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);
	/* Constituents of domain type 'p' must not start on a comma. */
	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		if ((ctxt->constituent[c].domain_type=='p') &&
			(strcmp(linkage->word[ctxt->constituent[c].left], ",")==0)) {
			ctxt->constituent[c].left++;
		}
	}

	/* Make sure the constituents are nested. If two constituents
	 * are not nested: whichever constituent has the furthest left
	 * boundary, shift that boundary rightwards to the left boundary
	 * of the other one.
	 */
	while (1) {
		adjustment_made=0;
		for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
			for (c2=numcon_total; c2<numcon_total + numcon_subl; c2++) {
				/* c starts before c2 and ends inside it: a partial overlap. */
				if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
					(ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
					(ctxt->constituent[c].right >= ctxt->constituent[c2].left)) {

					/* We've found two overlapping constituents.
					   If one is larger, except the smaller one
					   includes an extra comma, adjust the smaller one
					   to exclude the comma */

					if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",")==0) ||
						(strcmp(linkage->word[ctxt->constituent[c2].right],
								"RIGHT-WALL")==0)) {
						if (verbosity>=2)
							printf("Adjusting %d to fix comma overlap\n", c2);
						adjust_for_right_comma(ctxt, linkage, c2);
						adjustment_made=1;
					}
					else if (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0) {
						if (verbosity>=2)
							printf("Adjusting c %d to fix comma overlap\n", c);
						adjust_for_left_comma(ctxt, linkage, c);
						adjustment_made=1;
					}
					else {
					  /* NOTE(review): adjustment_made is not set on this
					   * path, so this fix alone does not trigger another
					   * pass of the outer while loop. */
					  if (verbosity>=2) {
						printf("WARNING: the constituents aren't nested! Adjusting them." \
							   "(%d, %d)\n", c, c2);
					  }
					  ctxt->constituent[c].left = ctxt->constituent[c2].left;
					}
				}
			}
		}
		if (adjustment_made==0) break;
	}

	/* This labels certain words as auxiliaries (such as forms of "be"
	   with passives, forms of "have" wth past participles,
	   "to" with infinitives). These words start VP's which include
	   them. In Treebank I, these don't get printed unless they're part of an
	   andlist, in which case they get labeled "X". (this is why we need to
	   label them as "aux".) In Treebank II, however, they seem to be treated
	   just like other verbs, so the "aux" stuff isn't needed. */


	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		ctxt->constituent[c].subl = linkage->current;
		if (((ctxt->constituent[c].domain_type == 'v') &&
			(ctxt->wordtype[linkage_get_link_rword(linkage,
											 ctxt->constituent[c].start_num)]==PTYPE))
		   ||
		   ((ctxt->constituent[c].domain_type == 't') &&
			(strcmp(ctxt->constituent[c].type, "VP")==0))) {
			ctxt->constituent[c].aux=1;
		}
		else ctxt->constituent[c].aux=0;
	}

	/* NOTE(review): this loop unconditionally resets aux to 0 for every
	 * constituent, overriding whatever the loop above computed (in line
	 * with the Treebank II remark in the comment above). */
	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		ctxt->constituent[c].subl = linkage->current;
		ctxt->constituent[c].aux=0;
	}

	return numcon_subl;
}
예제 #6
0
/**
 * Build the constituents of the current sublinkage from the domains
 * stored in linkage->hpsg_pp_data, writing them into ctxt->constituent[]
 * in the index range [numcon_total, numcon_total + numcon_subl).
 *
 * The work is done in four stages:
 *  1. For every domain, compute a word span from the domain's links and
 *     add a base constituent, plus extra constituents (S, SBAR, WHNP,
 *     WHPP, WHADVP) selected by the domain type and start-link label.
 *  2. Call gen_comp() once per case (openers, subject phrase, relative
 *     clause, modifiers, appositive, S-V inversion); each call returns
 *     the updated constituent count.
 *  3. Call adjust_subordinate_clauses(), then move the left boundary of
 *     every 'p'-domain constituent past a leading comma.
 *  4. Repeatedly repair pairs of constituents that overlap without
 *     being nested.
 *
 * @param ctxt          constituent context; its constituent[] and
 *                      wordtype[] arrays are read and written here.
 * @param linkage       the linkage whose domains and words are examined.
 * @param numcon_total  index of the first constituent slot to use.
 * @return the number of constituents generated for this sublinkage.
 */
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
                                          int numcon_total)
{
	size_t d, l, w2;
	int c, w, c2, numcon_subl = 0;

	/* c tracks the constituent index in step with the domain index d.
	 * NOTE(review): the code below relies on add_constituent() storing
	 * the new entry at index c+1 and returning that index -- it is
	 * defined elsewhere; confirm. */
	for (d = 0, c = numcon_total; d < linkage->hpsg_pp_data.N_domains; d++, c++)
	{
		size_t leftmost, rightmost, leftlimit;
		int rootleft;
		List_o_links * dlink;

		Domain domain = linkage->hpsg_pp_data.domain_array[d];

		// rootright = linkage_get_link_rword(linkage, domain.start_link);
		rootleft =  linkage_get_link_lword(linkage, domain.start_link);

		/* For these domain types the span is seeded with the left word
		 * of the start link and may grow leftwards without restriction;
		 * for all other types it is seeded with the right word and may
		 * not extend left of the word after the start link's left word. */
		if ((domain.type=='c') ||
			(domain.type=='d') ||
			(domain.type=='e') ||
			(domain.type=='f') ||
			(domain.type=='g') ||
			(domain.type=='u') ||
			(domain.type=='y'))
		{
			leftlimit = 0;
			leftmost = linkage_get_link_lword(linkage, domain.start_link);
			rightmost = linkage_get_link_lword(linkage, domain.start_link);
		}
		else
		{
			leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1;
			leftmost = linkage_get_link_rword(linkage, domain.start_link);
			rightmost = linkage_get_link_rword(linkage, domain.start_link);
		}

		/* At this point both the left and right limits are set to a
		 * single word of the start link (its left word for the domain
		 * types listed above, its right word otherwise), so a domain
		 * that contains no links still yields a one-word span.
		 * The loop below widens the span over every link in the domain,
		 * never moving the left edge below leftlimit.
		 */
		for (dlink = domain.lol; dlink != NULL; dlink = dlink->next)
		{
			l = dlink->link;

			if ((linkage_get_link_lword(linkage, l) < leftmost) &&
				(linkage_get_link_lword(linkage, l) >= leftlimit))
			{
				leftmost = linkage_get_link_lword(linkage, l);
			}

			if (linkage_get_link_rword(linkage, l) > rightmost)
			{
				rightmost = linkage_get_link_rword(linkage, l);
			}
		}

		/* Step back one slot before adding the base constituent for this
		 * domain; its label comes from cons_of_domain(). From here on,
		 * c is the value returned by the most recent add_constituent(). */
		c--;
		c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost,
						cons_of_domain(linkage, domain.type));

		/* 'z' and 'c' domains each get an additional S over the same span. */
		if (domain.type == 'z')
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
		}
		if (domain.type=='c')
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
		}
		/* Start link matching "Ce*" or "Rn": add an SBAR over the span. */
		if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("Rn", ctxt->constituent[c].start_link)==1))
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "SBAR");
		}
		/* Start link matching "R*" or "MX#r": add a one-word WHNP at the
		 * leftmost word, skipping over a leading comma. */
		if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("MX#r", ctxt->constituent[c].start_link)==1))
		{
			w = leftmost;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHNP");
		}
		/* Start link matching "Mj": add a two-word WHPP starting at the
		 * leftmost non-comma word, and a WHNP on its second word. */
		if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1)
		{
			w = leftmost;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			c = add_constituent(ctxt, c, linkage, &domain, w, w+1, "WHPP");
			c = add_constituent(ctxt, c, linkage, &domain, w+1, w+1, "WHNP");
		}
		/* Start link matching "Ss#d" or "B#d": add a one-word WHNP at the
		 * left word of the start link, then an SBAR running from that word
		 * to the right edge of the constituent stored just before the
		 * WHNP (index c-1 after the WHNP has been added). */
		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("B#d", ctxt->constituent[c].start_link)==1))
		{
			c = add_constituent(ctxt, c, linkage, &domain, rootleft, rootleft, "WHNP");
			c = add_constituent(ctxt, c, linkage, &domain,
							rootleft, ctxt->constituent[c-1].right, "SBAR");
		}
		/* Start link matching "CP": drop a leading comma from the current
		 * constituent, then add an S from word 1 to num_words-1. */
		if (post_process_match("CP", ctxt->constituent[c].start_link)==1)
		{
			if (strcmp(linkage->word[leftmost], ",") == 0)
				ctxt->constituent[c].left++;
			c = add_constituent(ctxt, c, linkage, &domain, 1, linkage->num_words-1, "S");
		}
		/* Start link matching "MVs", or an 'f' domain: if the first
		 * non-comma word of the current constituent is "when", add a
		 * one-word WHADVP on it. */
		if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
			(domain.type=='f'))
		{
			w = ctxt->constituent[c].left;
			if (strcmp(linkage->word[w], ",") == 0)
				w++;
			if (strcmp(linkage->word[w], "when") == 0)
			{
				c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHADVP");
			}
		}
		/* 't' domains get an additional S over the same span. */
		if (domain.type=='t')
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
		}
		/* Start link matching "QI", "Mr" or "MX#d": add a one-word
		 * constituent on the first non-comma word, labelled according to
		 * that word's wordtype: NONE gives WHADVP, QTYPE and QDTYPE give
		 * WHNP; any other wordtype trips the assert. */
		if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) ||
			(post_process_match("Mr", ctxt->constituent[c].start_link) == 1) ||
			(post_process_match("MX#d", ctxt->constituent[c].start_link) == 1))
		{
			const char * name = "";
			w = leftmost;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			if (ctxt->wordtype[w] == NONE)
				name = "WHADVP";
			else if (ctxt->wordtype[w] == QTYPE)
				name = "WHNP";
			else if (ctxt->wordtype[w] == QDTYPE)
				name = "WHNP";
			else
				assert(0, "Unexpected word type");
			c = add_constituent(ctxt, c, linkage, &domain, w, w, name);

			if (ctxt->wordtype[w] == QDTYPE)
			{
				/* Now find the finite verb to the right, start an S */
				/* Limit w2 to sentence length. */
				// for( w2=w+1; w2 < ctxt->r_limit-1; w2++ )
				/* The scan stops at the first STYPE/PTYPE word, or at
				 * rightmost if none is found before it. */
				for (w2 = w+1; w2 < rightmost; w2++)
				  if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break;

				/* Adjust the right boundary of previous constituent */
				ctxt->constituent[c].right = w2 - 1;
				c = add_constituent(ctxt, c, linkage, &domain, w2, rightmost, "S");
			}
		}

		/* Sanity checks on the last constituent added for this domain.
		 * NOTE(review): ec is passed to err_msg() without any of its
		 * members being set here -- confirm err_msg() does not read
		 * uninitialized fields. */
		if (ctxt->constituent[c].domain_type == '\0')
		{
			err_ctxt ec;
			err_msg(&ec, Error, "Error: no domain type assigned to constituent\n");
		}
		if (ctxt->constituent[c].start_link == NULL)
		{
			err_ctxt ec;
			err_msg(&ec, Error, "Error: no type assigned to constituent\n");
		}
	}

	/* The loop's final c++ leaves c one past the last constituent, so
	 * this difference is the number of constituents added above. */
	numcon_subl = c - numcon_total;
	/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl);  */

	if (verbosity >= 2)
		printf("Constituents added at first stage:\n");

	/* NOTE(review): only the header above is guarded by verbosity;
	 * print_constituent() is called unconditionally -- presumably it
	 * checks verbosity itself; confirm. */
	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		print_constituent(ctxt, linkage, c);
	}

	/* Opener case - generates S around main clause.
	   (This must be done first; the S generated will be needed for
	   later cases.) */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", CASE_OPENER);

	/* pp opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", CASE_PPOPEN);

	/* participle opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", CASE_PART_OPEN);

	/* Subject-phrase case; every main VP generates an S */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", CASE_S);

	/* Relative clause case; an SBAR generates a complement NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", CASE_REL_CLAUSE);

	/* Participle modifier case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", CASE_PART_MOD);

	/* PP modifying NP (uses the same case code as the participle
	   modifier case above) */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", CASE_PART_MOD);

	/* Appositive case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", CASE_APPOS);

	/* S-V inversion case; an NP generates a complement VP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", CASE_SVINV);

	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);
	/* Constituents from 'p' domains must not begin with a comma:
	 * move the left boundary one word to the right if they do. */
	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		if ((ctxt->constituent[c].domain_type=='p') &&
			(strcmp(linkage->word[ctxt->constituent[c].left], ",")==0))
		{
			ctxt->constituent[c].left++;
		}
	}

	/* Make sure the constituents are nested. If two constituents
	 * are not nested: whichever constituent has the furthest left
	 * boundary, shift that boundary rightwards to the left boundary
	 * of the other one.
	 *
	 * All ordered pairs (c, c2) are scanned; the scan is repeated for
	 * as long as a pass performs at least one comma adjustment.
	 */
	while (true)
	{
		bool adjustment_made = false;
		for (c = numcon_total; c < numcon_total + numcon_subl; c++)
		{
			for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++)
			{
				/* c starts and ends before c2 does, yet reaches into
				 * c2: the two overlap without either containing the
				 * other. */
				if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
					(ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
					(ctxt->constituent[c].right >= ctxt->constituent[c2].left))
				{
					/* We've found two overlapping constituents.
					   If one is larger, except the smaller one
					   includes an extra comma, adjust the smaller one
					   to exclude the comma */

					if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) ||
						(strcmp(linkage->word[ctxt->constituent[c2].right],
								"RIGHT-WALL") == 0))
					{
						if (verbosity >= 2)
							printf("Adjusting %d to fix comma overlap\n", c2);
						adjust_for_right_comma(ctxt, linkage, c2);
						adjustment_made = true;
					}
					else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
					{
						if (verbosity >= 2)
							printf("Adjusting c %d to fix comma overlap\n", c);
						adjust_for_left_comma(ctxt, linkage, c);
						adjustment_made = true;
					}
					else
					{
						/* No comma to blame: force nesting by moving
						 * c's left edge up to c2's left edge. Note that
						 * adjustment_made is not set in this branch, so
						 * this repair on its own does not trigger
						 * another pass. */
						if (verbosity >= 2)
						{
							err_ctxt ec;
							err_msg(&ec, Warn,
							      "Warning: the constituents aren't nested! "
							      "Adjusting them. (%d, %d)\n", c, c2);
					  }
					  ctxt->constituent[c].left = ctxt->constituent[c2].left;
					}
				}
			}
		}
		if (adjustment_made == false) break;
	}

	/* NOTE(review): this capacity check runs only after every
	 * constituent has already been stored above -- confirm that the
	 * array cannot be overrun before this point is reached. */
	assert (numcon_total + numcon_subl < ctxt->conlen, "Too many constituents");
	return numcon_subl;
}