/**
 * Fill in the table ctxt->word_used[i][w]: the cell is 1 when word w is
 * the left or right end of at least one link of sublinkage i, else 0.
 *
 * When the linkage is marked as unionized and has more than one
 * sublinkage, the last sublinkage is left out of the table.
 *
 * Side effect: linkage->current is set to each sublinkage index in turn
 * and is left at the last index processed.
 */
static void count_words_used(con_context_t *ctxt, Linkage linkage)
{
	int subl_count = linkage->num_sublinkages;
	int sub;

	if ((linkage->unionized == 1) && (subl_count > 1)) subl_count--;

	if (verbosity >= 2)
		printf("Number of sublinkages = %d\n", subl_count);

	for (sub = 0; sub < subl_count; sub++)
	{
		int word, lk;

		/* Start from an all-zero row for this sublinkage. */
		for (word = 0; word < linkage->num_words; word++)
			ctxt->word_used[sub][word] = 0;

		/* The link accessors below operate on the current sublinkage. */
		linkage->current = sub;

		/* Both ends of every link count as "used". */
		for (lk = 0; lk < linkage_get_num_links(linkage); lk++)
		{
			ctxt->word_used[sub][linkage_get_link_lword(linkage, lk)] = 1;
			ctxt->word_used[sub][linkage_get_link_rword(linkage, lk)] = 1;
		}

		if (verbosity < 2) continue;

		/* Debug dump of the row that was just computed. */
		printf("Sublinkage %d: ", sub);
		for (word = 0; word < linkage->num_words; word++)
		{
			switch (ctxt->word_used[sub][word])
			{
				case 0: printf("0 "); break;
				case 1: printf("1 "); break;
				default: break;
			}
		}
		printf("\n");
	}
}
/**
 * Classify every word of the linkage into ctxt->wordtype[].
 *
 * All words start as NONE. Then, for each link (in link order), the word
 * on the RIGHT end of the link is classified:
 *  - link label S, SX or SF: STYPE; upgraded to PTYPE if the same word
 *    is also the LEFT end of a link matching "Pg#b", "I", "PP" or "Pv".
 *  - link label matching "QI#d": QTYPE; upgraded to QDTYPE if the same
 *    word is also the LEFT end of a link matching "D##w".
 *  - link label matching "Mr" or "MX#d": QDTYPE.
 *
 * The checks are applied in that order and links are visited in order,
 * so a later assignment overwrites an earlier one for the same word.
 * (The original notes state this is called once for each sublinkage.)
 */
static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage)
{
	int outer, inner, word;

	for (word = 0; word < linkage->num_words; word++)
		ctxt->wordtype[word] = NONE;

	for (outer = 0; outer < linkage_get_num_links(linkage); outer++)
	{
		const int head = linkage_get_link_rword(linkage, outer);
		const char * head_label = linkage_get_link_label(linkage, outer);

		/* Right end of a subject link: a finite verb (or auxiliary). */
		if ((uppercompare(head_label, "S") == 0) ||
		    (uppercompare(head_label, "SX") == 0) ||
		    (uppercompare(head_label, "SF") == 0))
		{
			ctxt->wordtype[head] = STYPE;

			for (inner = 0; inner < linkage_get_num_links(linkage); inner++)
			{
				const int tail = linkage_get_link_lword(linkage, inner);
				const char * tail_label = linkage_get_link_label(linkage, inner);

				if (tail != head) continue;

				/* Pvf, Pgf? */
				if ((post_process_match("Pg#b", tail_label) == 1) ||
				    (uppercompare(tail_label, "I") == 0) ||
				    (uppercompare(tail_label, "PP") == 0) ||
				    (post_process_match("Pv", tail_label) == 1))
				{
					ctxt->wordtype[head] = PTYPE;
				}
			}
		}

		/* Question word of an indirect question; a determiner if it
		 * also starts a D##w link. */
		if (post_process_match("QI#d", head_label) == 1)
		{
			ctxt->wordtype[head] = QTYPE;

			for (inner = 0; inner < linkage_get_num_links(linkage); inner++)
			{
				const int tail = linkage_get_link_lword(linkage, inner);
				const char * tail_label = linkage_get_link_label(linkage, inner);

				if (tail != head) continue;

				if (post_process_match("D##w", tail_label) == 1)
					ctxt->wordtype[head] = QDTYPE;
			}
		}

		if ((post_process_match("Mr", head_label) == 1) ||
		    (post_process_match("MX#d", head_label) == 1))
		{
			ctxt->wordtype[head] = QDTYPE;
		}
	}
}
/*
 * Class:     LinkGrammar
 * Method:    getLinkRWord
 * Signature: (I)I
 *
 * JNI entry point: returns the index of the word on the right end of
 * link number i of the linkage held in the calling thread's data
 * (as obtained from get_ptd).
 */
JNIEXPORT jint JNICALL
Java_org_linkgrammar_LinkGrammar_getLinkRWord(JNIEnv *env, jclass cls, jint i)
{
	per_thread_data *thread_state = get_ptd(env, cls);
	jint right_word = linkage_get_link_rword(thread_state->linkage, i);

	return right_word;
}
SPOTriplets NLP::sentence2triplets ( const char* sentence ) { // vector of triplets SPOTriplets triplets; #ifdef DEBUG std::cout << "The sentence: " << sentence << std::endl; #endif // creates a Sentence from the input char* Sentence sent = sentence_create ( sentence, dict_ ); #ifdef DEBUG std::cout << "Sentence created" << std::endl; #endif // tokenizes the sentence sentence_split ( sent, parse_opts_ ); #ifdef DEBUG std::cout << "Sentence splitted" << std::endl; #endif // searches for all possible linkages int num_linkages = sentence_parse ( sent, parse_opts_ ); #ifdef DEBUG std::cout << "Sentence parsed" << std::endl; std::cout << "Number of linkages: " << num_linkages << std::endl; #endif // just one triplet SPOTriplet triplet; // if there is any linkage in the sentence if( num_linkages > 0 ) { // create the linkage Linkage linkage = linkage_create ( 0, sent, parse_opts_ ); #ifdef DEBUG // prints the sentence's diagram std::cout << "The diagram: " << std::endl; char *diagram = linkage_print_diagram(linkage, true, 800); std::cout << diagram << std::endl; linkage_free_diagram( diagram ); // end print diagram #endif std::vector<std::string> labels; // 1. find the S_link // S* except there is an SJ* because then S* except Spx // two cases: there is SJ* and there is not SJ* // TODO: VJlp VJrp same as SJ but to predications // TODO: SFut SFst what the f**k? 
###FIXED### // TODO: His form was shining like the light not working ###FIXED### // TODO: Car is mine not working ###FIXED### // TODO: The little brown bear has eaten all of the honey not working ###FIXED### // REGEXES std::regex SJ_( "SJ.*" ); std::regex VJ_( "VJ.*"); std::regex subject( "(Ss.*)|(SFut)|(Sp\*.*)" ); std::regex Spx( "Spx.*" ); // TODO:fix theese initializer list not allowed ###FIXED### std::regex predicate( "(Pv.*)|(Pg.*)|(PP.*)|(I.*)|(TO)|(MVi.*)" ); // TODO: make one from theese // (Sp.*)|(Ss.*) ###FIXED### std::regex noun_adject_object ( "(O.*)|(Os.*)|(Op.*)|(MVpn.*)|(Pa.*)|(MVa.*)" ); std::regex preposition ( "(MVp.*)|(Pp.*)|(OF)|(TO)" ); std::regex prep_object ( "(J.*)|(TI)|(I.*)|(ON)" ); // TODO: problems with matching!! Pg*!! ###FIXED### // TODO: problems with matching!! Mvp.*!! ###FIXED### bool s_found = false; bool p_found = false; bool o_found = false; bool SJ = false; // search for SJ.s labels for( auto label: labels ) { if( std::regex_match( label, SJ_ ) ) { SJ = true; break; } } // multiple subject in the sentence if( SJ ) { // SPls left -> first subject // SPrs right -> second subject // Spx right -> predicate // SJ-s are multiple subjects std::string temp; // go through every linkage for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // get their label std::string l = linkage_get_link_label( linkage, i ); // if there is an SJl* label if( std::regex_match( l, std::regex( "SJl.*" ) ) ) { // SJls left side triplet.s = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); triplet.cut( triplet.s ); temp = triplet.s + " "; // and word triplet.s = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); triplet.cut( triplet.s ); temp += triplet.s + " "; // find SJr* for( int j = 0; j < linkage_get_num_links( linkage ); ++j ) { std::string m = linkage_get_link_label( linkage, j ); if( std::regex_match( m, std::regex( "SJr.*" ) ) ) { triplet.s = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) 
); triplet.cut(); temp += triplet.s; triplet.s = temp; s_found = true; #ifdef DEBUG std::cout << "Subject found: " << triplet.s << std::endl; #endif break; } // if } // for break; } // if } // for // now we have the subject // find Spx and its right side will be the starter predicate std::string current_word; for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { std::string l = linkage_get_link_label( linkage, i ); if( std::regex_match( l, std::regex( "Spx.*" ) ) ) { triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); current_word = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); } } // from now all the same as on the else branch !!!! bool predicate_match = false; // search for the linkage that has triplet.s as left! do { predicate_match = false; for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // every linkage's left word std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); // every linkage's label std::string l = linkage_get_link_label( linkage, i ); if( std::regex_match( l, predicate ) && word_i == current_word ) { // found predicate triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); current_word = triplet.p; predicate_match = true; break; } } } while( predicate_match ); // we now have the predicate too // TODO: multiple predicates! 
p_found = true; #ifdef DEBUG std::cout << "Predicate found: " << triplet.p << std::endl; #endif // ###COPY BEGIN### // search for noun object or adjective object for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // get every linkage label std::string l = linkage_get_link_label( linkage, i ); // get the left word of every linkage std::string l_word = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); // if thete is a label that match AND its left word is the predicate if( std::regex_match( l, noun_adject_object ) && triplet.p == l_word ) { // then the object is that linkage's right word triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); triplet.cut( triplet.o ); o_found = true; #ifdef DEBUG std::cout << "Adjective or noun object found: " << triplet.o << std::endl; #endif } // if } // for // still not found object, then search for preposition if( !o_found ) { // go through every linkage for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // get the linkage's label std::string l = linkage_get_link_label( linkage, i ); // and left word std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); // if there is a linkage which is a preposition and its left word is the predicate if( std::regex_match( l, preposition ) && triplet.p == word_i ) { // found preposition // search for prep_object // then the temp will contain the preposition label's right word std::string temp = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); #ifdef DEBUG std::cout << "Preposition found! 
and its rigth word is: " << temp << std::endl; #endif for( int j = 0; j < linkage_get_num_links( linkage ); ++j ) { // every linkages std::string m = linkage_get_link_label( linkage, j ); // every left word std::string word_j = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) ); // if there is a label with match and its left is exactly the preposition's right if( std::regex_match( m, prep_object ) && temp == word_j ) { triplet.o = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) ); triplet.cut(triplet.o); triplet.o += " "; // save o std::string temp = triplet.o; triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) ); triplet.cut(triplet.o); temp += triplet.o; triplet.o = temp; o_found = true; #ifdef DEBUG std::cout << "Object found: " << triplet.o << std::endl; #endif } // if( std::regex_match( m, prep_object ) && temp == word_j ) END } // for J END } // if( std::regex_match( l, preposition ) && triplet.p == word_i ) END } // for I END } // if( !o_found ) END if( s_found && p_found && o_found ) { // TODO: cut the words itself not the whole triplet // have to cut every word itself // triplet.cut(); triplet.cut(triplet.s); triplet.cut(triplet.p); triplets.push_back( triplet ); s_found = false; p_found = false; o_found = false; } // ###COPY END### } else // only one subject { // except Spx!!! 
// S left -> subject // S right -> predicate at first // if the word next to S right, is an element of Pv*, Pg* PP*, I*, TO, MVi* // then the new predicate will be that word std::string current_word; // search for subject (S_link) for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // get the linkage's label std::string l = linkage_get_link_label( linkage, i ); if( std::regex_match( l, subject ) ) { // subject found triplet.s = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); s_found = true; current_word = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); triplet.p = current_word; #ifdef DEBUG std::cout << "Subject found: " << triplet.s << std::endl; #endif break; } } if( s_found ) { bool predicate_match = false; // search for the linkage that has triplet.s as left! do { predicate_match = false; for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // every linkage's left word std::string l_word = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); // every linkage's label std::string l = linkage_get_link_label( linkage, i ); if( std::regex_match( l, predicate ) && l_word == current_word ) { // found predicate triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); current_word = triplet.p; predicate_match = true; break; } } // for END } while( predicate_match ); p_found = true; #ifdef DEBUG std::cout << "Predicate found: " << triplet.p << std::endl; #endif } // if( s_found ) END // subject and predicate found // search for object // from k to linkage_get_num_links( linkage ) // if there is any of the noun, adjective od preposition object then that // label's right will give the object. // !!! search only between labels that has triplet.p as left word !!!!! 
// search for noun object or adjective objects // go through all links for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // get every linkage label std::string l = linkage_get_link_label( linkage, i ); // get the left word of every linkage std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); // if thete is a label that match AND its left word is the predicate if( std::regex_match( l, noun_adject_object ) && triplet.p == word_i ) { // then the object is that linkage's right word triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); o_found = true; triplet.cut(triplet.o); #ifdef DEBUG std::cout << "Adjective or noun object found: " << triplet.o << std::endl; #endif } // if END } // for END // still not found object, then search for preposition if( !o_found ) { // go through every linkage for( int i = 0; i < linkage_get_num_links( linkage ); ++i ) { // get the linkage's label std::string l = linkage_get_link_label( linkage, i ); // and left word std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) ); // if there is a linkage which is a preposition and its left word is the predicate if( std::regex_match( l, preposition ) && triplet.p == word_i ) { // found preposition // search for prep_object // then the temp will contain the preposition label's right word std::string temp = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) ); #ifdef DEBUG std::cout << "Preposition found! 
and its rigth word is: " << temp << std::endl; #endif // start search from there for( int j = 0; j < linkage_get_num_links( linkage ); ++j ) { // every linkages std::string m = linkage_get_link_label( linkage, j ); // every left word std::string word_j = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) ); #ifdef DEBUG if( std::regex_match( m, prep_object ) ) std::cout << m << " DOES match to (J.*)|(TI)|(I.*)|(ON)" << std::endl; #endif // if there is a label with match and its left is exactly the preposition's right if( std::regex_match( m, prep_object ) && temp == word_j ) { triplet.o = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) ); triplet.cut(triplet.o); triplet.o += " "; // save o std::string temp = triplet.o; triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) ); triplet.cut(triplet.o); temp += triplet.o; triplet.o = temp; #ifdef DEBUG std::cout << "Object found: " << triplet.o << std::endl; #endif o_found = true; } } // for } // if } // for } // if( o_found ) END if( s_found && p_found && o_found ) { // TODO: cut the words itself not the whole triplet ###FIXED### // have to cut every word itself // triplet.cut(); triplet.cut(triplet.s); triplet.cut(triplet.p); triplets.push_back( triplet ); s_found = false; p_found = false; o_found = false; } } // end else linkage_delete ( linkage ); } // if( num_linkages > 0 ) END
/**
 * Build the constituents of sublinkage s from its post-processing domains.
 *
 * For every domain of linkage->sublinkage[s] a base constituent is added
 * (its span is computed from the domain's start link and link list), then
 * extra constituents (S, SBAR, WHNP, WHPP, WHADVP) are added depending on
 * the domain type and on the label of the start link. Afterwards the
 * gen_comp() passes add complement constituents, subordinate clauses are
 * adjusted, and overlapping (non-nested) constituents are repaired.
 *
 * New constituents are stored in ctxt->constituent[] starting at index
 * numcon_total.
 *
 * @param ctxt          constituent context (constituent[] and wordtype[])
 * @param linkage       the linkage being processed
 * @param numcon_total  number of constituents already stored in ctxt
 * @param s             index of the sublinkage to read domains from
 * @return the number of constituents belonging to this sublinkage, as
 *         returned by the last gen_comp() pass
 */
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
                                          int numcon_total, int s)
{
	int d, c, leftlimit, l, leftmost, rightmost, w, c2, numcon_subl=0, w2;
	List_o_links * dlink;
	int rootright, rootleft, adjustment_made;
	Sublinkage * subl;
	Domain domain;

	/* r_limit is not declared in this function; presumably a file-scope
	 * variable read elsewhere -- the assignment is kept as is. */
	r_limit = linkage->num_words-2; /**PV**/

	subl = &linkage->sublinkage[s];

	for (d=0, c=numcon_total; d<subl->pp_data.N_domains; d++, c++) {
		domain = subl->pp_data.domain_array[d];

		/* Fetch both ends of the start link once and reuse them below. */
		rootright = linkage_get_link_rword(linkage, domain.start_link);
		rootleft = linkage_get_link_lword(linkage, domain.start_link);

		/* Seed both boundaries with one end of the start link: the left
		 * word for the domain types listed here, the right word for all
		 * others. That word is always contained in the constituent, and
		 * this also handles a domain that contains no links. */
		switch (domain.type) {
		case 'c': case 'd': case 'e': case 'f':
		case 'g': case 'u': case 'y':
			leftlimit = 0;
			leftmost = rootleft;
			rightmost = rootleft;
			break;
		default:
			leftlimit = rootleft + 1;
			leftmost = rootright;
			rightmost = rootright;
			break;
		}

		/* Widen the span over the domain's links; the left boundary may
		 * not move to the left of leftlimit. */
		for (dlink = domain.lol; dlink!=NULL; dlink=dlink->next) {
			l=dlink->link;

			if ((linkage_get_link_lword(linkage, l) < leftmost) &&
			    (linkage_get_link_lword(linkage, l) >= leftlimit))
				leftmost = linkage_get_link_lword(linkage, l);

			if (linkage_get_link_rword(linkage, l) > rightmost)
				rightmost = linkage_get_link_rword(linkage, l);
		}

		/* The loop header already advanced c; step back so that the
		 * value returned by add_constituent() becomes the new c. The
		 * checks below read ctxt->constituent[c] after every call, so
		 * their order matters. */
		c--;
		c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost,
		                    cons_of_domain(domain.type));

		if (domain.type=='z') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if (domain.type=='c') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("Rn", ctxt->constituent[c].start_link)==1)) {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "SBAR");
		}
		if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("MX#r", ctxt->constituent[c].start_link)==1)) {
			w=leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			c = add_constituent(ctxt, c, linkage, domain, w, w, "WHNP");
		}
		if (post_process_match("Mj", ctxt->constituent[c].start_link)==1) {
			w=leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			c = add_constituent(ctxt, c, linkage, domain, w, w+1, "WHPP");
			c = add_constituent(ctxt, c, linkage, domain, w+1, w+1, "WHNP");
		}
		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("B#d", ctxt->constituent[c].start_link)==1)) {
			c = add_constituent(ctxt, c, linkage, domain, rootleft, rootleft, "WHNP");
			c = add_constituent(ctxt, c, linkage, domain, rootleft,
			                    ctxt->constituent[c-1].right, "SBAR");
		}
		if (post_process_match("CP", ctxt->constituent[c].start_link)==1) {
			if (strcmp(linkage->word[leftmost], ",")==0)
				ctxt->constituent[c].left++;
			c = add_constituent(ctxt, c, linkage, domain, 1, linkage->num_words-1, "S");
		}
		if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
		    (domain.type=='f')) {
			w=ctxt->constituent[c].left;
			if (strcmp(linkage->word[w], ",")==0) w++;
			if (strcmp(linkage->word[w], "when")==0) {
				c = add_constituent(ctxt, c, linkage, domain, w, w, "WHADVP");
			}
		}
		if (domain.type=='t') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("QI", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("Mr", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("MX#d", ctxt->constituent[c].start_link)==1)) {
			/* Initialized so that add_constituent() never receives an
			 * indeterminate pointer if the fallback assert below does
			 * not stop execution. */
			const char * name = "";

			w = leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			if (ctxt->wordtype[w] == NONE)
				name = "WHADVP";
			else if (ctxt->wordtype[w] == QTYPE)
				name = "WHNP";
			else if (ctxt->wordtype[w] == QDTYPE)
				name = "WHNP";
			else
				assert(0, "Unexpected word type");
			c = add_constituent(ctxt, c, linkage, domain, w, w, name);

			if (ctxt->wordtype[w] == QDTYPE) {
				/* Now find the finite verb to the right, start an S.
				 * The scan stops at rightmost, so w2 stays inside the
				 * span of this domain. */
				for (w2 = w+1; w2 < rightmost; w2++)
					if ((ctxt->wordtype[w2] == STYPE) ||
					    (ctxt->wordtype[w2] == PTYPE)) break;

				/* Adjust the right boundary of previous constituent */
				ctxt->constituent[c].right = w2-1;
				c = add_constituent(ctxt, c, linkage, domain, w2, rightmost, "S");
			}
		}

		if (ctxt->constituent[c].domain_type=='\0') {
			error("Error: no domain type assigned to constituent\n");
		}
		if (ctxt->constituent[c].start_link==NULL) {
			error("Error: no type assigned to constituent\n");
		}
	}

	numcon_subl = c - numcon_total;
	/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl); */

	if (verbosity >= 2)
		printf("Constituents added at first stage for subl %d:\n",
		       linkage->current);
	for (c = numcon_total; c < numcon_total + numcon_subl; c++) {
		print_constituent(ctxt, linkage, c);
	}

	/* Opener case - generates S around main clause.
	   (This must be done first; the S generated will be needed for
	   later cases.) */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", 5);

	/* pp opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", 6);

	/* participle opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", 9);

	/* Subject-phrase case; every main VP generates an S */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", 1);

	/* Relative clause case; an SBAR generates a complement NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", 3);

	/* Participle modifier case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", 8);

	/* PP modifying NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", 8);

	/* Appositive case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", 4);

	/* S-V inversion case; an NP generates a complement VP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", 7);

	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);

	/* A 'p'-domain constituent must not start on a comma. */
	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		if ((ctxt->constituent[c].domain_type=='p') &&
		    (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0)) {
			ctxt->constituent[c].left++;
		}
	}

	/* Make sure the constituents are nested. If two constituents
	 * are not nested: whichever constituent has the furthest left
	 * boundary, shift that boundary rightwards to the left boundary
	 * of the other one. The scan is repeated for as long as a comma
	 * adjustment was made in the previous pass. */
	do {
		adjustment_made=0;
		for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
			for (c2=numcon_total; c2<numcon_total + numcon_subl; c2++) {
				if (!((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
				      (ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
				      (ctxt->constituent[c].right >= ctxt->constituent[c2].left)))
					continue;

				/* We've found two overlapping constituents.
				   If one is larger, except the smaller one
				   includes an extra comma, adjust the smaller one
				   to exclude the comma */
				if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",")==0) ||
				    (strcmp(linkage->word[ctxt->constituent[c2].right],
				            "RIGHT-WALL")==0)) {
					if (verbosity>=2)
						printf("Adjusting %d to fix comma overlap\n", c2);
					adjust_for_right_comma(ctxt, linkage, c2);
					adjustment_made=1;
				}
				else if (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0) {
					if (verbosity>=2)
						printf("Adjusting c %d to fix comma overlap\n", c);
					adjust_for_left_comma(ctxt, linkage, c);
					adjustment_made=1;
				}
				else {
					if (verbosity>=2) {
						printf("WARNING: the constituents aren't nested! Adjusting them."
						       "(%d, %d)\n", c, c2);
					}
					ctxt->constituent[c].left = ctxt->constituent[c2].left;
				}
			}
		}
	} while (adjustment_made != 0);

	/* Earlier code labelled certain words as auxiliaries here (forms of
	   "be" with passives, forms of "have" with past participles, "to"
	   with infinitives): aux was set to 1 for 'v'-domain constituents
	   whose start link ends on a PTYPE word and for 't'-domain "VP"
	   constituents. In Treebank I these are not printed unless they are
	   part of an andlist; in Treebank II they are treated like other
	   verbs, so the "aux" marking isn't needed. That pass was followed
	   by a loop that reset every aux flag to 0, so only the net effect
	   is kept here. */
	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		ctxt->constituent[c].subl = linkage->current;
		ctxt->constituent[c].aux=0;
	}

	return numcon_subl;
}
/**
 * Build constituents from the post-processing domains stored in
 * linkage->hpsg_pp_data.
 *
 * For every domain a base constituent is added (its span is computed from
 * the domain's start link and link list), then extra constituents (S,
 * SBAR, WHNP, WHPP, WHADVP) are added depending on the domain type and on
 * the label of the start link. Afterwards the gen_comp() passes add
 * complement constituents, subordinate clauses are adjusted, and
 * overlapping (non-nested) constituents are repaired.
 *
 * New constituents are stored in ctxt->constituent[] starting at index
 * numcon_total.
 *
 * @param ctxt          constituent context (constituent[], wordtype[], conlen)
 * @param linkage       the linkage being processed
 * @param numcon_total  number of constituents already stored in ctxt
 * @return the number of constituents produced, as returned by the last
 *         gen_comp() pass
 */
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
                                          int numcon_total)
{
	size_t d, l, w2;
	int c, w, c2, numcon_subl = 0;

	for (d = 0, c = numcon_total; d < linkage->hpsg_pp_data.N_domains; d++, c++)
	{
		size_t leftmost, rightmost, leftlimit;
		int rootleft;
		List_o_links * dlink;

		Domain domain = linkage->hpsg_pp_data.domain_array[d];

		rootleft = linkage_get_link_lword(linkage, domain.start_link);

		/* Seed both boundaries with one end of the start link: the left
		 * word for the domain types listed here, the right word for all
		 * others. That word is always contained in the constituent, and
		 * this also handles a domain that contains no links. */
		switch (domain.type)
		{
			case 'c': case 'd': case 'e': case 'f':
			case 'g': case 'u': case 'y':
				leftlimit = 0;
				leftmost = linkage_get_link_lword(linkage, domain.start_link);
				rightmost = linkage_get_link_lword(linkage, domain.start_link);
				break;
			default:
				leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1;
				leftmost = linkage_get_link_rword(linkage, domain.start_link);
				rightmost = linkage_get_link_rword(linkage, domain.start_link);
				break;
		}

		/* Widen the span over the domain's links; the left boundary may
		 * not move to the left of leftlimit. */
		for (dlink = domain.lol; dlink != NULL; dlink = dlink->next)
		{
			l = dlink->link;

			if ((linkage_get_link_lword(linkage, l) < leftmost) &&
			    (linkage_get_link_lword(linkage, l) >= leftlimit))
			{
				leftmost = linkage_get_link_lword(linkage, l);
			}

			if (linkage_get_link_rword(linkage, l) > rightmost)
			{
				rightmost = linkage_get_link_rword(linkage, l);
			}
		}

		/* The loop header already advanced c; step back so that the
		 * value returned by add_constituent() becomes the new c. The
		 * checks below read ctxt->constituent[c] after every call, so
		 * their order matters. */
		c--;
		c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost,
		                    cons_of_domain(linkage, domain.type));

		if (domain.type == 'z')
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
		}
		if (domain.type=='c')
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("Rn", ctxt->constituent[c].start_link)==1))
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "SBAR");
		}
		if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("MX#r", ctxt->constituent[c].start_link)==1))
		{
			w = leftmost;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHNP");
		}
		if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1)
		{
			w = leftmost;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			c = add_constituent(ctxt, c, linkage, &domain, w, w+1, "WHPP");
			c = add_constituent(ctxt, c, linkage, &domain, w+1, w+1, "WHNP");
		}
		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
		    (post_process_match("B#d", ctxt->constituent[c].start_link)==1))
		{
			c = add_constituent(ctxt, c, linkage, &domain, rootleft, rootleft, "WHNP");
			c = add_constituent(ctxt, c, linkage, &domain, rootleft,
			                    ctxt->constituent[c-1].right, "SBAR");
		}
		if (post_process_match("CP", ctxt->constituent[c].start_link)==1)
		{
			if (strcmp(linkage->word[leftmost], ",") == 0)
				ctxt->constituent[c].left++;
			c = add_constituent(ctxt, c, linkage, &domain, 1, linkage->num_words-1, "S");
		}
		if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
		    (domain.type=='f'))
		{
			w = ctxt->constituent[c].left;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			if (strcmp(linkage->word[w], "when") == 0)
			{
				c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHADVP");
			}
		}
		if (domain.type=='t')
		{
			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("Mr", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("MX#d", ctxt->constituent[c].start_link) == 1))
		{
			const char * name = "";
			w = leftmost;
			if (strcmp(linkage->word[w], ",") == 0) w++;
			if (ctxt->wordtype[w] == NONE)
				name = "WHADVP";
			else if (ctxt->wordtype[w] == QTYPE)
				name = "WHNP";
			else if (ctxt->wordtype[w] == QDTYPE)
				name = "WHNP";
			else
				assert(0, "Unexpected word type");
			c = add_constituent(ctxt, c, linkage, &domain, w, w, name);

			if (ctxt->wordtype[w] == QDTYPE)
			{
				/* Now find the finite verb to the right, start an S.
				 * The scan stops at rightmost, so w2 stays inside the
				 * span of this domain. */
				for (w2 = w+1; w2 < rightmost; w2++)
					if ((ctxt->wordtype[w2] == STYPE) ||
					    (ctxt->wordtype[w2] == PTYPE)) break;

				/* Adjust the right boundary of previous constituent */
				ctxt->constituent[c].right = w2 - 1;
				c = add_constituent(ctxt, c, linkage, &domain, w2, rightmost, "S");
			}
		}

		if (ctxt->constituent[c].domain_type == '\0')
		{
			/* Zero-initialized: the context is handed to err_msg(), which
			 * presumably reads it -- it must not be indeterminate. */
			err_ctxt ec = {0};
			err_msg(&ec, Error, "Error: no domain type assigned to constituent\n");
		}
		if (ctxt->constituent[c].start_link == NULL)
		{
			err_ctxt ec = {0};
			err_msg(&ec, Error, "Error: no type assigned to constituent\n");
		}
	}

	numcon_subl = c - numcon_total;
	/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl); */

	if (verbosity >= 2)
		printf("Constituents added at first stage:\n");

	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		print_constituent(ctxt, linkage, c);
	}

	/* Opener case - generates S around main clause.
	   (This must be done first; the S generated will be needed for
	   later cases.) */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", CASE_OPENER);

	/* pp opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", CASE_PPOPEN);

	/* participle opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", CASE_PART_OPEN);

	/* Subject-phrase case; every main VP generates an S */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", CASE_S);

	/* Relative clause case; an SBAR generates a complement NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", CASE_REL_CLAUSE);

	/* Participle modifier case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", CASE_PART_MOD);

	/* PP modifying NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", CASE_PART_MOD);

	/* Appositive case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", CASE_APPOS);

	/* S-V inversion case; an NP generates a complement VP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", CASE_SVINV);

	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);

	/* A 'p'-domain constituent must not start on a comma. */
	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		if ((ctxt->constituent[c].domain_type=='p') &&
		    (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0))
		{
			ctxt->constituent[c].left++;
		}
	}

	/* Make sure the constituents are nested. If two constituents
	 * are not nested: whichever constituent has the furthest left
	 * boundary, shift that boundary rightwards to the left boundary
	 * of the other one. The scan is repeated for as long as a comma
	 * adjustment was made in the previous pass. */
	while (true)
	{
		bool adjustment_made = false;
		for (c = numcon_total; c < numcon_total + numcon_subl; c++)
		{
			for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++)
			{
				if (!((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
				      (ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
				      (ctxt->constituent[c].right >= ctxt->constituent[c2].left)))
					continue;

				/* We've found two overlapping constituents.
				   If one is larger, except the smaller one
				   includes an extra comma, adjust the smaller one
				   to exclude the comma */
				if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) ||
				    (strcmp(linkage->word[ctxt->constituent[c2].right],
				            "RIGHT-WALL") == 0))
				{
					if (verbosity >= 2)
						printf("Adjusting %d to fix comma overlap\n", c2);
					adjust_for_right_comma(ctxt, linkage, c2);
					adjustment_made = true;
				}
				else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
				{
					if (verbosity >= 2)
						printf("Adjusting c %d to fix comma overlap\n", c);
					adjust_for_left_comma(ctxt, linkage, c);
					adjustment_made = true;
				}
				else
				{
					if (verbosity >= 2)
					{
						/* Zero-initialized for the same reason as above. */
						err_ctxt ec = {0};
						err_msg(&ec, Warn,
						      "Warning: the constituents aren't nested! "
						      "Adjusting them. (%d, %d)\n", c, c2);
					}
					ctxt->constituent[c].left = ctxt->constituent[c2].left;
				}
			}
		}
		if (adjustment_made == false) break;
	}

	assert (numcon_total + numcon_subl < ctxt->conlen, "Too many constituents");
	return numcon_subl;
}