void do_cqi_cl_struc2str(void) { int *struclist; int len, i; char *a, *str; Attribute *attribute; a = cqi_read_string(); len = cqi_read_int_list(&struclist); if (server_debug) { Rprintf( "CQi: CQI_CL_STRUC2STR('%s', [", a); for (i=0; i<len; i++) Rprintf( "%d ", struclist[i]); Rprintf( "])\n"); } attribute = cqi_lookup_attribute(a, ATT_STRUC); if (attribute == NULL) { cqi_command(cqi_errno); } else { /* we assemble the CQI_DATA_STRING_LIST() return command by hand, so we don't have to allocate a temporary list */ cqi_send_word(CQI_DATA_STRING_LIST); cqi_send_int(len); /* list size */ for (i=0; i<len; i++) { str = cl_struc2str(attribute, struclist[i]); cqi_send_string(str); /* sends "" if str == NULL (wrong alignment number) */ } } cqi_flush(); if (struclist != NULL) free(struclist); /* don't forget to free allocated memory */ free(a); }
/** * Prints a starting tag for each s-attribute. */ void decode_print_surrounding_s_att_values(int position) { int i; char *tagname; for (i = 0; i < printValuesIndex; i++) { if (printValues[i]) { char *sval; int snum; snum = cl_cpos2struc(printValues[i], position); if (snum >= 0) { /* if it is a p- or a- attribute, snum is a CL error (less than 0) */ sval = decode_string_escape(cl_struc2str(printValues[i], snum)); tagname = printValues[i]->any.name; switch (mode) { case ConclineMode: printf("<%s %s>: ", tagname, sval); break; case LispMode: printf("(VALUE %s \"%s\")\n", tagname, sval); break; case XMLMode: printf("<element name=\"%s\" value=\"%s\"/>\n", tagname, sval); break; case EncodeMode: /* pretends to be a comment, but has to be stripped before feeding output to encode */ printf("# %s=%s\n", tagname, sval); break; case StandardMode: default: printf("<%s %s>\n", tagname, sval); break; } } else { /* don't print tag if start position is not in region */ } } } }
/*
 * ------------------------------------------------------------------------
 *
 * "rcqpCmd_struc2str(SEXP inAttribute, SEXP inIds)" --
 *
 * Returns, for each structure number in inIds, the annotation string of
 * that region of the specified s-attribute.
 *
 * inAttribute  character vector of length 1 naming the s-attribute
 * inIds        vector of structure numbers; it is coerced to integer
 *              before use, so numeric (double) input is accepted
 *
 * Result: a character vector with one element per id. Elements for which
 * cl_struc2str() returns NULL are left as initialised by allocVector()
 * (the empty string for a STRSXP). If the attribute cannot be found,
 * rcqp_error_code() is called with the current cqi_errno.
 *
 * ------------------------------------------------------------------------
 */
SEXP rcqpCmd_struc2str(SEXP inAttribute, SEXP inIds)
{
	SEXP result = R_NilValue;
	SEXP ids;
	int nprotected = 0;	/* number of PROTECT calls still to be undone */
	int *idv;
	int len, i;
	char *a, *str;
	Attribute * attribute;

	if (!isString(inAttribute) || length(inAttribute) != 1) error("argument 'attribute' must be a string");
	PROTECT(inAttribute);
	nprotected++;

	if (!isVector(inIds)) error("argument 'ids' must be a vector of integers");
	/* INTEGER() is only valid on an INTSXP: coerce first so that doubles
	 * (e.g. c(1, 2, 3)) are converted rather than misread or rejected.
	 * coerceVector() may allocate, hence the PROTECT on its result. */
	ids = PROTECT(coerceVector(inIds, INTSXP));
	nprotected++;

	a = (char*)CHAR(STRING_ELT(inAttribute,0));
	len = length(ids);

	attribute = cqi_lookup_attribute(a, ATT_STRUC);
	if (attribute == NULL) {
		/* Balance the protect stack before reporting the error.
		 * NOTE(review): rcqp_error_code() presumably raises an R error and
		 * does not return; the explicit return keeps the stack balanced
		 * even if it does. */
		UNPROTECT(nprotected);
		rcqp_error_code(cqi_errno);
		return result;
	}

	result = PROTECT(allocVector(STRSXP, len));
	nprotected++;

	idv = INTEGER(ids);
	for (i=0; i<len; i++) {
		str = cl_struc2str(attribute, idv[i]);
		/* a NULL value (invalid structure number) leaves the element untouched */
		if (str != NULL) {
			SET_STRING_ELT(result, i, mkChar(str));
		}
	}

	UNPROTECT(nprotected);

	return result;
}
/**
 * Prints out the requested attributes for a sequence of tokens
 * (or a single token if end_position == -1).
 *
 * If the -c flag was used (and, thus, the context parameter is not NULL),
 * then the sequence is extended to the entire s-attribute region (in matchlist mode).
 *
 * Output goes to stdout and its layout depends on the global `mode`
 * (LispMode, EncodeMode, ConclineMode, XMLMode, StandardMode) and on the
 * global `printnum` flag. For each corpus position in the (possibly extended)
 * range the function prints, in this order: the corpus position (if
 * requested), start tags (Encode/Concline/XML modes only), the values of all
 * attributes in print_list[], a token separator, and end tags
 * (Encode/Concline/XML modes only).
 *
 * The globals N_sar and s_att_regions[] are overwritten for every token.
 *
 * @param start_position  First corpus position to print.
 * @param end_position    Last corpus position to print; any negative value
 *                        makes the range collapse to start_position alone.
 * @param context         S-attribute used to extend the range to its
 *                        containing region(s), or NULL for no extension.
 */
void decode_print_token_sequence(int start_position, int end_position, Attribute *context) {
  int alg, aligned_start, aligned_end, aligned_start2, aligned_end2, rng_start, rng_end, snum;
  int start_context, end_context, dummy;
  int lastposa, i, w;
  /* pointer used for values of p-attributes */
  char *wrd;

  start_context = start_position;
  end_context = (end_position >= 0) ? end_position : start_position;
  /* above ensures that in non-matchlist mode (where ep == -1), we only print one token */

  if (context != NULL) {
    /* expand the start_context end_context numbers to the start
     * and end points of the containing region of the context s-attribute */
    if (!cl_cpos2struc2cpos(context, start_position, &start_context, &end_context)) {
      /* start position is not inside a context region: fall back to the plain range */
      start_context = start_position;
      end_context = (end_position >= 0) ? end_position : start_position;
    }
    else if (end_position >= 0) {
      /* only the end of the region containing end_position is of interest here;
       * its start is discarded into dummy */
      if (!cl_cpos2struc2cpos(context, end_position, &dummy, &end_context)) {
        end_context = (end_position >= 0) ? end_position : start_position;
      }
    }

    /* indicate that we're showing context */
    switch (mode) {
    case LispMode:
      /* opens a list that is closed by the final ")\n" at the end of this function */
      printf("(TARGET %d\n", start_position);
      if (end_position >= 0)
        printf("(INTERVAL %d %d)\n", start_position, end_position);
      break;
    case EncodeMode:
    case ConclineMode:
      /* nothing here */
      break;
    case XMLMode:
      printf("<context start=\"%d\" end=\"%d\"/>\n", start_context, end_context);
      break;
    case StandardMode:
    default:
      if (end_position >= 0) {
        printf("INTERVAL %d %d\n", start_position, end_position);
      }
      else {
        printf("TARGET %d\n", start_position);
      }
      break;
    }
  } /* endif context != NULL */

  /* some extra information in -L and -H modes */
  if (mode == LispMode && end_position != -1)
    printf("(CONTEXT %d %d)\n", start_context, end_context);
  else if (mode == ConclineMode) {
    if (printnum)
      printf("%8d: ", start_position);
  }

  /* now print the token sequence (including context) with all requested attributes */
  for (w = start_context; w <= end_context; w++) {
    int beg_of_line;

    /* extract s-attribute regions for start and end tags into s_att_regions[] */
    /* NOTE(review): no bounds check on s_att_regions[] is visible here; presumably
     * the array is sized for the maximum number of print_list entries -- confirm */
    N_sar = 0; /* counter and index */
    for (i = 0; i < print_list_index; i++) {
      if (print_list[i]->any.type == ATT_STRUC) {
        /* record a region only if this token is its first or last position */
        if ( ((snum = cl_cpos2struc(print_list[i], w)) >= 0) &&
             (cl_struc2cpos(print_list[i], snum, &rng_start, &rng_end)) &&
             ((w == rng_start) || (w == rng_end)) ) {
          s_att_regions[N_sar].name = print_list[i]->any.name;
          s_att_regions[N_sar].start = rng_start;
          s_att_regions[N_sar].end = rng_end;
          /* the annotation is only fetched for attributes that have values */
          if (cl_struc_values(print_list[i]))
            s_att_regions[N_sar].annot = cl_struc2str(print_list[i], snum);
          else
            s_att_regions[N_sar].annot = NULL;
          N_sar++;
        }
      }
    }
    decode_sort_s_att_regions(); /* sort regions to ensure proper nesting of start and end tags */

    /* show corpus positions with -n option */
    if (printnum)
      switch (mode) {
      case LispMode:
        printf("(%d ", w);
        break;
      case EncodeMode:
        printf("%8d\t", w);
        break;
      case ConclineMode:
        /* nothing here (shown at start of line in -H mode) */
        break;
      case XMLMode:
        /* nothing here */
        break;
      case StandardMode:
      default:
        printf("%8d: ", w);
        break;
      }
    else {
      if (mode == LispMode)
        printf("("); /* entire match is parenthesised list in -L mode */
    }

    /* index of the last p-attribute printed for this token;
     * assigned below but not read anywhere within this function */
    lastposa = -1;

    /* print start tags (s- and a-attributes) with -C,-H,-X */
    if ((mode == EncodeMode) || (mode == ConclineMode) || (mode == XMLMode)) {

      /* print a-attributes from print_list[] */
      for (i = 0; i < print_list_index; i++) {
        switch (print_list[i]->any.type) {
        case ATT_ALIGN:
          /* a start tag is printed only on the first token of an alignment bead */
          if ( ((alg = cl_cpos2alg(print_list[i], w)) >= 0) &&
               (cl_alg2cpos(print_list[i], alg, &aligned_start, &aligned_end, &aligned_start2, &aligned_end2)) &&
               (w == aligned_start) ) {
            if (mode == XMLMode) {
              printf("<align type=\"start\" target=\"%s\"", print_list[i]->any.name);
              if (printnum)
                printf(" start=\"%d\" end=\"%d\"", aligned_start2, aligned_end2);
              printf("/>\n");
            }
            else {
              printf("<%s", print_list[i]->any.name);
              if (printnum)
                printf(" %d %d", aligned_start2, aligned_end2);
              /* tags go on their own line in encode mode, inline otherwise */
              printf(">%c", (mode == EncodeMode) ? '\n' : ' ');
            }
          }
          break;
        default:
          /* ignore all other attribute types */
          break;
        }
      }

      /* print s-attributes from s_att_regions[] (using sar_sort_index[]) */
      for (i = 0; i < N_sar; i++) {
        SAttRegion *region = &(s_att_regions[sar_sort_index[i]]);
        if (region->start == w) {
          if (mode == XMLMode) {
            printf("<tag type=\"start\" name=\"%s\"", region->name);
            if (printnum)
              printf(" cpos=\"%d\"", w);
            if (region->annot)
              printf(" value=\"%s\"", decode_string_escape(region->annot));
            printf("/>\n");
          }
          else {
            /* the annotation (if any) follows the tag name after a single blank;
             * it is printed unescaped in these modes */
            printf("<%s%s%s>%c", region->name, region->annot ? " " : "", region->annot ? region->annot : "", (mode == ConclineMode ? ' ' : '\n'));
          }
        }
      }
    }

    /* now print token with its attribute values (p-attributes only for -C,-H,-X) */
    if (mode == XMLMode) {
      printf("<token");
      if (printnum)
        printf(" cpos=\"%d\"", w);
      printf(">");
    }

    /* beg_of_line suppresses the separator before the first p-attribute value
     * in encode and concline modes */
    beg_of_line = 1;
    /* Loop printing each attribute for this cpos (w) */
    for (i = 0; i < print_list_index; i++) {
      switch (print_list[i]->any.type) {
      case ATT_POS:
        lastposa = i;
        if ((wrd = decode_string_escape(cl_cpos2str(print_list[i], w))) != NULL) {
          switch (mode) {
          case LispMode:
            printf("(%s \"%s\")", print_list[i]->any.name, wrd);
            break;
          case EncodeMode:
            /* tab-separated values */
            if (beg_of_line) {
              printf("%s", wrd);
              beg_of_line = 0;
            }
            else
              printf("\t%s", wrd);
            break;
          case ConclineMode:
            /* slash-separated values */
            if (beg_of_line) {
              printf("%s", wrd);
              beg_of_line = 0;
            }
            else
              printf("/%s", wrd);
            break;
          case XMLMode:
            printf(" <attr name=\"%s\">%s</attr>", print_list[i]->any.name, wrd);
            break;
          case StandardMode:
          default:
            printf("%s=%s\t", print_list[i]->any.name, wrd);
            break;
          }
        }
        else {
          /* no string value could be obtained for this position: report and clean up */
          cl_error("(aborting) cl_cpos2str() failed");
          decode_cleanup(1);
        }
        break;

      case ATT_ALIGN:
        /* do not print in encode, concline or xml modes because already done (above) */
        if ((mode != EncodeMode) && (mode != ConclineMode) && (mode != XMLMode)) {
          if ( ((alg = cl_cpos2alg(print_list[i], w)) >= 0) &&
               (cl_alg2cpos(print_list[i], alg, &aligned_start, &aligned_end, &aligned_start2, &aligned_end2)) ) {
            if (mode == LispMode) {
              printf("(ALG %d %d %d %d)", aligned_start, aligned_end, aligned_start2, aligned_end2);
            }
            else {
              printf("%d-%d==>%s:%d-%d\t", aligned_start, aligned_end, print_list[i]->any.name, aligned_start2, aligned_end2);
            }
          }
          else if (cl_errno != CDA_OK) {
            cl_error("(aborting) alignment error");
            decode_cleanup(1);
          }
        }
        break;

      case ATT_STRUC:
        /* do not print in encode, concline or xml modes because already done (above) */
        if ((mode != EncodeMode) && (mode != ConclineMode) && (mode != XMLMode)) {
          if (cl_cpos2struc2cpos(print_list[i], w, &rng_start, &rng_end)) {
            /* standard and -L mode don't show tag annotations */
            printf(mode == LispMode ? "(STRUC %s %d %d)" : "<%s>:%d-%d\t", print_list[i]->any.name, rng_start, rng_end);
          }
          /* NOTE(review): unlike the two error branches above, this one reports
           * the error but does not call decode_cleanup(1) -- confirm this is intended */
          else if (cl_errno != CDA_OK)
            cl_error("(aborting) cl_cpos2struc2cpos() failed");
        }
        break;

      case ATT_DYN:
        /* dynamic attributes aren't implemented */
      default:
        break;
      }
    }

    /* print token separator (or end of token in XML mode) */
    switch (mode) {
    case LispMode:
      printf(")\n");
      break;
    case ConclineMode:
      printf(" ");
      break;
    case XMLMode:
      printf(" </token>\n");
      break;
    case EncodeMode:
    case StandardMode:
    default:
      printf("\n");
      break;
    }

    /* now, after printing all the positional attributes, print end tags with -H,-C,-X */
    if (mode == EncodeMode || mode == ConclineMode || mode == XMLMode) {

      /* print s-attributes from s_att_regions[] (using sar_sort_index[] in reverse order) */
      for (i = N_sar - 1; i >= 0; i--) {
        SAttRegion *region = &(s_att_regions[sar_sort_index[i]]);
        if (region->end == w) {
          if (mode == XMLMode) {
            printf("<tag type=\"end\" name=\"%s\"", region->name);
            if (printnum)
              printf(" cpos=\"%d\"", w);
            printf("/>\n");
          }
          else {
            printf("</%s>%c", region->name, (mode == ConclineMode ? ' ' : '\n'));
          }
        }
      }

      /* print a-attributes from print_list[] */
      for (i = 0; i < print_list_index; i++) {
        switch (print_list[i]->any.type) {
        case ATT_ALIGN:
          /* an end tag is printed only on the last token of an alignment bead */
          if ( ((alg = cl_cpos2alg(print_list[i], w)) >= 0) &&
               (cl_alg2cpos(print_list[i], alg, &aligned_start, &aligned_end, &aligned_start2, &aligned_end2)) &&
               (w == aligned_end) ) {
            if (mode == XMLMode) {
              printf("<align type=\"end\" target=\"%s\"", print_list[i]->any.name);
              if (printnum)
                printf(" start=\"%d\" end=\"%d\"", aligned_start2, aligned_end2);
              printf("/>\n");
            }
            else {
              printf("</%s", print_list[i]->any.name);
              if (printnum)
                printf(" %d %d", aligned_start2, aligned_end2);
              printf(">%c", (mode == EncodeMode) ? '\n' : ' ');
            }
          }
          break;
        default:
          /* ignore all other attribute types */
          break;
        }
      }

    } /* end of print end tags */

  } /* end of match range loop: for w from start_context to end_context */

  /* end of match (for matchlist mode in particular) */
  if ((context != NULL) && (mode == LispMode))
    printf(")\n"); /* closes the "(TARGET" list opened above */
  else if (mode == ConclineMode)
    printf("\n");

  return;
}
/** * Main function for cwb-s-encode. * * @param argc Number of command-line arguments. * @param argv Command-line arguments. */ int main(int argc, char **argv) { int input_line; int start, end; char *annot; char buf[CL_MAX_LINE_LENGTH]; Attribute *att; int V_switch, values, S_annotations_dropped; int i, N; progname = argv[0]; sencode_parse_options(argc, argv); /* -a mode: read existing regions into memory */ if (add_to_existing) { if (corpus == NULL) { Rprintf( "Error: You have to specify source corpus (-C <corpus>) for -a switch.\n"); rcqp_receive_error(1); } att = cl_new_attribute(corpus, new_satt.name, ATT_STRUC); if ((att != NULL) && (cl_max_struc(att) > 0)) { V_switch = new_satt.store_values; values = cl_struc_values(att); if (V_switch && (!values)) { Rprintf( "Error: Existing regions of -V attribute have no annotations.\n"); rcqp_receive_error(1); } else if ((!V_switch) && values) { Rprintf( "Error: Existing regions of -S attributes have annotations.\n"); rcqp_receive_error(1); } if (!silent) Rprintf("[Loading previous <%s> regions]\n", new_satt.name); N = cl_max_struc(att); for (i = 0; i < N; i++) { cl_struc2cpos(att, i, &start, &end); annot = cl_struc2str(att, i); SL_insert(start, end, annot); } } else { if (!silent) Rprintf("[No <%s> regions defined (skipped)]\n", new_satt.name); } } /* loop reading input (stdin or -f <file>) */ if (in_memory && (!silent)) Rprintf("[Reading input data]\n"); input_line = 0; S_annotations_dropped = 0; while (fgets(buf, CL_MAX_LINE_LENGTH, text_fd)) { input_line++; /* check for buffer overflow */ if (strlen(buf) >= (CL_MAX_LINE_LENGTH - 1)) { Rprintf( "BUFFER OVERFLOW, input line #%d is too long:\n>> %s", input_line, buf); rcqp_receive_error(1); } if (! 
sencode_parse_line(buf, &start, &end, &annot)) { Rprintf( "FORMAT ERROR on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } if (new_satt.store_values && (annot == NULL)) { Rprintf( "MISSING ANNOTATION on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } if ((!new_satt.store_values) && (annot != NULL)) { if (! S_annotations_dropped) Rprintf( "WARNING: Annotation for -S attribute ignored on line #%d (warning issued only once):\n>> %s", input_line, buf); S_annotations_dropped++; } if ((start <= new_satt.last_cpos) || (end < start)) { Rprintf( "RANGE INCONSISTENCY on line #%d:\n>> %s(end of previous region was %d)\n", input_line, buf, new_satt.last_cpos); rcqp_receive_error(1); } if (annot != NULL && set_att != set_none) { /* convert set annotation into standard syntax */ annot = sencode_check_set(annot); if (annot == NULL) { Rprintf( "SET ANNOTATION SYNTAX ERROR on line #%d:\n>> %s", input_line, buf); rcqp_receive_error(1); } } /* debugging output */ if (debug) { Rprintf( "[%d, %d]", start, end); if (annot != NULL) Rprintf( " <%s>", annot); Rprintf( "\n"); } /* in -M mode, store this region in memory; otherwise write it to the disk files */ if (in_memory) SL_insert(start, end, annot); else sencode_write_region(start, end, annot); cl_free(annot); } /* in -M mode, write data to disk now that we have finished looping across input data */ if (in_memory) { SL item; if (!silent) Rprintf("[Creating encoded disk file(s)]\n"); SL_rewind(); while ((item = SL_next()) != NULL) sencode_write_region(item->start, item->end, item->annot); } /* close files */ sencode_close_files(); if (S_annotations_dropped > 0) Rprintf( "Warning: %d annotation values dropped for -S attribute '%s'.\n", S_annotations_dropped, new_satt.name); rcqp_receive_error(0); }
/**
 * Validates the arguments of a group command and computes the grouping.
 *
 * The source attribute is optional: it is skipped when no source attribute
 * name is given and source_field is NoField. Source and target attribute are
 * each looked up as a p-attribute first and as an s-attribute second; an
 * s-attribute is only accepted if it has annotated values. The anchor fields
 * must be defined for the named query.
 *
 * @param cl                Named query (subcorpus) to be grouped.
 * @param source_field      Anchor for the source attribute (NoField if unused).
 * @param source_offset     Offset applied to the source anchor.
 * @param source_attr_name  Name of the source attribute (NULL if unused).
 * @param target_field      Anchor for the target attribute.
 * @param target_offset     Offset applied to the target anchor.
 * @param target_attr_name  Name of the target attribute.
 * @param cutoff_freq       Frequency cutoff stored in the Group object.
 * @return                  Whatever ComputeGroupExternally() or
 *                          ComputeGroupInternally() returns for the new Group
 *                          object; NULL if validation fails (a message has
 *                          been issued via cqpmessage() in that case).
 */
Group *
compute_grouping(CorpusList *cl,
                 FieldType source_field, int source_offset, char *source_attr_name,
                 FieldType target_field, int target_offset, char *target_attr_name,
                 int cutoff_freq)
{
  Group *grp;
  Attribute *src_att = NULL, *tgt_att;
  int src_struc = 0, tgt_struc = 0;
  char *source_base = NULL, *target_base = NULL;

  if (cl == NULL || cl->corpus == NULL) {
    cqpmessage(Warning, "Grouping:\nCan't access corpus.");
    return NULL;
  }

  if (cl->size == 0 || cl->range == NULL) {
    cqpmessage(Warning, "Corpus %s is empty, no grouping possible", cl->name);
    return NULL;
  }

  /* ---- source side (skipped when neither attribute nor anchor is given) ---- */
  if (source_attr_name != NULL || source_field != NoField) {
    src_att = find_attribute(cl->corpus, source_attr_name, ATT_POS, NULL);
    if (src_att == NULL) {
      /* not a p-attribute: try again as s-attribute */
      src_att = find_attribute(cl->corpus, source_attr_name, ATT_STRUC, NULL);
      src_struc = 1;
    }
    if (src_att == NULL) {
      cqpmessage(Error, "Can't find attribute ``%s'' for named query %s", source_attr_name, cl->name);
      return NULL;
    }

    if (src_struc) {
      if (!cl_struc_values(src_att)) {
        cqpmessage(Error, "No annotated values for s-attribute ``%s'' in named query %s", source_attr_name, cl->name);
        return NULL;
      }
      source_base = cl_struc2str(src_att, 0); /* should be beginning of the attribute's lexicon */
      assert(source_base && "Internal error. Please don't use s-attributes in group command.");
    }

    if (source_field == KeywordField) {
      if (cl->keywords == NULL) {
        cqpmessage(Error, "No keyword anchors defined for %s", cl->name);
        return NULL;
      }
    }
    else if (source_field == TargetField) {
      if (cl->targets == NULL) {
        cqpmessage(Error, "No target anchors defined for %s", cl->name);
        return NULL;
      }
    }
    else if (source_field == MatchField || source_field == MatchEndField) {
      assert(cl->range && cl->size > 0);
    }
    else {
      /* NoField and anything unknown */
      cqpmessage(Error, "Illegal second anchor in group command");
      return NULL;
    }
  }

  /* ---- target side (mandatory) ---- */
  tgt_att = find_attribute(cl->corpus, target_attr_name, ATT_POS, NULL);
  if (tgt_att == NULL) {
    /* not a p-attribute: try again as s-attribute */
    tgt_att = find_attribute(cl->corpus, target_attr_name, ATT_STRUC, NULL);
    tgt_struc = 1;
  }
  if (tgt_att == NULL) {
    cqpmessage(Error, "Can't find attribute ``%s'' for named query %s", target_attr_name, cl->name);
    return NULL;
  }

  if (tgt_struc) {
    if (!cl_struc_values(tgt_att)) {
      cqpmessage(Error, "No annotated values for s-attribute ``%s'' in named query %s", target_attr_name, cl->name);
      return NULL;
    }
    target_base = cl_struc2str(tgt_att, 0); /* should be beginning of the attribute's lexicon */
    assert(target_base && "Internal error. Please don't use s-attributes in group command.");
  }

  if (target_field == KeywordField) {
    if (cl->keywords == NULL) {
      cqpmessage(Error, "No keyword anchors defined for %s", cl->name);
      return NULL;
    }
  }
  else if (target_field == TargetField) {
    if (cl->targets == NULL) {
      cqpmessage(Error, "No target anchors defined for %s", cl->name);
      return NULL;
    }
  }
  else if (target_field == MatchField || target_field == MatchEndField) {
    assert(cl->range && cl->size > 0);
  }
  else {
    /* NoField and anything unknown */
    cqpmessage(Error, "Illegal anchor in group command");
    return NULL;
  }

  /* ---- set up Group object ---- */
  grp = (Group *) cl_malloc(sizeof(Group));

  grp->my_corpus = cl;

  grp->source_attribute = src_att;
  grp->source_offset = source_offset;
  grp->source_is_struc = src_struc;
  grp->source_base = source_base;
  grp->source_field = source_field;

  grp->target_attribute = tgt_att;
  grp->target_offset = target_offset;
  grp->target_is_struc = tgt_struc;
  grp->target_base = target_base;
  grp->target_field = target_field;

  grp->nr_cells = 0;
  grp->count_cells = NULL;
  grp->cutoff_frequency = cutoff_freq;

  /* external grouping is only used when enabled, permitted, and no s-attribute is involved */
  if (UseExternalGrouping && !insecure && !src_struc && !tgt_struc)
    return ComputeGroupExternally(grp); /* modifies Group object in place and returns pointer or NULL */

  return ComputeGroupInternally(grp);
}