// Parse a parenthesized list from a character string.
//
// `input` must point at the opening '(' and is advanced past everything
// that gets consumed, including the closing ')' when one is found.  The
// second parameter is unused.
//
// A period that is followed by exactly one object and then ')' produces a
// dotted pair: that object becomes the tail of the last cell.  A period in
// any other position is stored as an ordinary list element.
//
// If the string ends before a ')' is seen, expected_right_paren() is called
// with whatever has been built so far.  Returns the list (0 if empty).
static FrList *string_to_List(const char *&input, const char *)
{
   FrList *head = 0 ;
   FrList *tail = 0 ;

   ++input ;                            // step over the opening parenthesis
   ++FramepaC_read_nesting_level ;

   char next ;
   for (;;)
      {
      next = FrSkipWhitespace(input) ;
      if (next == ')' || next == '\0')
         break ;
      FrObject *item = string_to_FrObject(input) ;
      if (item == symbolPERIOD && FrSkipWhitespace(input) != ')')
         {
         // the period is not the final item, so it may introduce the tail
         // of a dotted pair
         FrObject *after_dot = string_to_FrObject(input) ;
         next = FrSkipWhitespace(input) ;
         if (next == ')')
            {
            // period was second-to-last: attach the final object as the tail
            if (!head)
               head = tail = new FrList(0) ;
            tail->replacd(after_dot) ;
            break ;
            }
         // otherwise the period is just another element; append it now and
         // let the common code below append the object that followed it
         FrList *dot_cell = new FrList(item) ;
         if (head)
            tail->replacd(dot_cell) ;
         else
            head = dot_cell ;
         tail = dot_cell ;
         item = after_dot ;
         }
      FrList *cell = new FrList(item) ;
      if (head)
         tail->replacd(cell) ;
      else
         head = cell ;
      tail = cell ;
      }

   if (next == ')')
      ++input ;                         // consume the closing parenthesis
   else
      expected_right_paren(head) ;

   // leaving the outermost list: discard any accumulated read associations
   if (--FramepaC_read_nesting_level <= 0 && FramepaC_read_associations)
      {
      FramepaC_read_associations->freeObject() ;
      FramepaC_read_associations = 0 ;
      }
   return head ;
}
// Read a parenthesized list from a stream.
//
// The next character on `input` is taken to be the opening '(' and is
// discarded.  Elements are read with read_FrObject() until a ')' is next,
// or the stream reports eof/fail.  The second parameter is unused.
//
// A period that is followed by exactly one object and then ')' produces a
// dotted pair: that object becomes the tail of the last cell.  A period in
// any other position is stored as an ordinary list element.
//
// If the character following the elements is not ')',
// expected_right_paren() is called with the list built so far.
// Returns the list (0 if empty).
static FrList *read_List(istream &input, const char *)
{
   FrList *head = 0 ;
   FrList *tail = 0 ;

   input.get() ;                        // drop the opening parenthesis
   ++FramepaC_read_nesting_level ;

   while (FrSkipWhitespace(input) != ')' && !input.eof() && !input.fail())
      {
      FrObject *item = read_FrObject(input) ;
      if (item == symbolPERIOD && FrSkipWhitespace(input) != ')')
         {
         // the period is not the final item, so it may introduce the tail
         // of a dotted pair
         FrObject *after_dot = read_FrObject(input) ;
         if (FrSkipWhitespace(input) == ')')
            {
            // period was second-to-last: attach the final object as the tail
            if (!head)
               head = tail = new FrList(0) ;
            tail->replacd(after_dot) ;
            break ;
            }
         // otherwise the period is just another element; append it now and
         // let the common code below append the object that followed it
         FrList *dot_cell = new FrList(item) ;
         if (head)
            tail->replacd(dot_cell) ;
         else
            head = dot_cell ;
         tail = dot_cell ;
         item = after_dot ;
         }
      FrList *cell = new FrList(item) ;
      if (head)
         tail->replacd(cell) ;
      else
         head = cell ;
      tail = cell ;
      }

   // the next character must be the closing parenthesis
   if (input.get() != ')')
      expected_right_paren(head) ;

   // leaving the outermost list: discard any accumulated read associations
   if (--FramepaC_read_nesting_level <= 0 && FramepaC_read_associations)
      {
      FramepaC_read_associations->freeObject() ;
      FramepaC_read_associations = 0 ;
      }
   return head ;
}
// Parse a frame from a character string.
//
// `input` must point at the opening '[' and is advanced past whatever is
// consumed.  The second parameter is unused.
//
// The frame name is read first and must be a symbol.  An existing frame
// found by find_vframe_inline() is reused; otherwise a new frame is created
// (through FramepaC_new_VFrame when read_as_VFrame is set and that hook is
// installed, else a plain FrFrame).  Each following '[' starts a slot, which
// is handed to string_to_Slot().  A missing closing ']' only produces a
// warning; the frame is still returned.
//
// Returns the frame, or 0 (after a warning) if the name is not a symbol.
FrObject *string_to_Frame(const char *&input, const char *)
{
   input++ ;                            // consume the initial left bracket
   FrSymbol *name = string_to_Symbol(input) ;   // read frame name
   if (!name || !name->symbolp())       // the name must be a symbol
      {
      FrWarning(errmsg_frame_name) ;
      // release whatever non-symbol object was read, as read_Frame() does,
      // so that it is not leaked
      free_object(name) ;
      return 0 ;
      }
   FrFrame *frame = find_vframe_inline(name) ;
   if (!frame)
      frame = (read_as_VFrame && FramepaC_new_VFrame)
                 ? FramepaC_new_VFrame(name)
                 : new FrFrame(name) ;
   while (FrSkipWhitespace(input) == '[')
      string_to_Slot(input,frame) ;
   if (*input == ']')
      input++ ;                         // skip the terminating bracket
   else
      FrWarning(errmsg_frame_malformed) ;
   return frame ;
}
static bool verify_List(const char *&input, const char *, bool) { input++ ; // consume initial left parenthesis char c ; while ((c = FrSkipWhitespace(input)) != ')' && c != '\0') { if (!valid_FrObject_string(input,true)) return false ; } if (c == ')') { input++ ; // skip terminating right paren return true ; // and indicate success } else return false ; }
static bool verify_Frame(const char *&input, const char *, bool) { input++ ; // consume the initial left bracket if (!verify_Symbol(input,true)) // check frame name return false ; while (FrSkipWhitespace(input) == '[') { if (!verify_Slot(input)) // check for well-formed slot repres. return false ; } if (*input == ']') { input++ ; // skip over terminator return true ; // and indicate success } else return false ; }
// Read one slot of `frame` from a stream.
//
// The next character on `input` is taken to be the slot's opening '[' and
// is discarded.  The slot name is read next and must be a symbol; if it is
// not, a warning is issued, the object that was read is released, and the
// function returns without touching the frame.  Otherwise the slot is
// created and each following '[' is handed to read_Facet().  A warning is
// issued if the character after the facets is not ']'.
static void read_Slot(istream &input,FrFrame *frame)
{
   input.get() ;                        // drop the left bracket
   FrSymbol *slot_name = read_Symbol(input) ;
   const bool name_ok = slot_name && slot_name->symbolp() ;
   if (!name_ok)
      {
      FrWarning(errmsg_slot_symbol) ;
      free_object(slot_name) ;
      return ;
      }
   frame->createSlot(slot_name) ;
   while (FrSkipWhitespace(input) == '[')
      read_Facet(input,frame,slot_name) ;
   // the terminator is consumed here; get() may also yield EOF
   const int terminator = input.get() ;
   if (terminator != ']')
      FrWarning(errmsg_slot_malformed) ;
}
static bool verify_Facet(const char *&input) { input++ ; // consume the left bracket if (!verify_Symbol(input,true)) // check facet name return false ; char c ; while ((c = FrSkipWhitespace(input)) != ']' && c != '\0') { if (!valid_FrObject_string(input,true)) return false ; } if (c == ']') { input++ ; // skip over terminator return true ; // and indicate success } else return false ; }
static void string_to_Slot(const char *&input,FrFrame *frame) { FrSymbol *slot ; input++ ; // consume the left bracket slot = string_to_Symbol(input) ; // get slot name if (!slot || !slot->symbolp()) // the name must be a symbol { FrWarning(errmsg_slot_symbol) ; return ; } frame->createSlot(slot) ; while (FrSkipWhitespace(input) == '[') string_to_Facet(input,frame,slot) ; if (*input != ']') FrWarning(errmsg_slot_malformed) ; else input++ ; }
// Read one facet of `slot` in `frame` from a stream.
//
// The next character on `input` is taken to be the facet's opening '[' and
// is discarded.  The facet name is read next and must be a symbol; if it is
// not, a warning is issued, the object that was read is released, and the
// function returns without touching the frame.  Otherwise the facet is
// created and every object read up to the closing ']' is added to it as a
// filler (without copying).  A warning is issued if the character after the
// fillers is not ']'.
static void read_Facet(istream &input,FrFrame *frame,FrSymbol *slot)
{
   input.get() ;                        // consume the left bracket
   FrSymbol *facet = read_Symbol(input) ;       // get facet name
   if (!facet || !facet->symbolp())     // the name must be a symbol
      {
      FrWarning(errmsg_facet_symbol) ;
      free_object(facet) ;
      return ;
      }
   frame->createFacet(slot,facet) ;
   char ch ;
   // Besides the sentinel value returned by FrSkipWhitespace(), also stop
   // when the stream itself reports end-of-file or failure, the same way
   // read_List() does, so that truncated input cannot keep this loop going.
   // NOTE(review): what FrSkipWhitespace(istream&) returns at end of input
   // is not visible here -- confirm whether the original "!= 0" test alone
   // was sufficient.
   while ((ch = FrSkipWhitespace(input)) != 0 && ch != ']' &&
          !input.eof() && !input.fail())
      {
      frame->addFillerNoCopy(slot,facet,read_FrObject(input)) ;
      }
   if (input.get() != ']')              // next char may also be EOF
      FrWarning(errmsg_facet_malformed) ;
}
// Read a frame from a stream.
//
// The next character on `input` is taken to be the opening '[' and is
// discarded.  The second parameter is unused.
//
// The frame name is read first and must be a symbol.  An existing frame
// found by find_vframe_inline() is reused; otherwise a new frame is created
// (through FramepaC_new_VFrame when read_as_VFrame is set and that hook is
// installed, else a plain FrFrame).  Each following '[' starts a slot, which
// is handed to read_Slot().  A warning is issued if the character after the
// slots is not ']', but the frame is still returned.
//
// Returns the frame, or 0 (after a warning, and after releasing the object
// that was read) if the name is not a symbol.
static FrObject *read_Frame(istream &input, const char *)
{
   input.get() ;                        // drop the opening bracket
   FrSymbol *frame_name = read_Symbol(input) ;
   if (frame_name && frame_name->symbolp())
      {
      FrFrame *result = find_vframe_inline(frame_name) ;
      if (!result)
         {
         if (read_as_VFrame && FramepaC_new_VFrame)
            result = FramepaC_new_VFrame(frame_name) ;
         else
            result = new FrFrame(frame_name) ;
         }
      while (FrSkipWhitespace(input) == '[')
         read_Slot(input,result) ;
      // the terminator is consumed here; get() may also yield EOF
      if (input.get() != ']')
         FrWarning(errmsg_frame_malformed) ;
      return result ;
      }
   // the name was missing or not a symbol
   FrWarning(errmsg_frame_name) ;
   free_object(frame_name) ;
   return 0 ;
}
static void string_to_Facet(const char *&input,FrFrame *frame,FrSymbol *slot) { FrSymbol *facet ; input++ ; // consume the left bracket facet = string_to_Symbol(input) ; // get facet name if (!facet || !facet->symbolp()) // the name must be a symbol { FrWarning(errmsg_facet_symbol) ; return ; } frame->createFacet(slot,facet) ; char c ; while ((c = FrSkipWhitespace(input)) != ']' && c != '\0') { frame->addFillerNoCopy(slot,facet,string_to_FrObject(input)) ; } if (c != ']') FrWarning(errmsg_facet_malformed) ; else input++ ; }
static char *next_abbreviation(FILE *fp) { static char line[FrMAX_LINE] ; if (feof(fp) || !fgets(line,sizeof(line),fp)) return 0 ; char *lineptr = line ; FrSkipWhitespace(lineptr) ; // strip off comments char *cmt = strchr(lineptr,';') ; if (cmt) *cmt = '\0' ; // use only the first token on the line for (char *lp = lineptr ; *lp ; lp++) { if (Fr_isspace(*lp)) { *lp = '\0' ; break ; } } return lineptr ; }
bool FrTextSpan::parse(const FrList *span, FrTextSpans *contain) { if (span && span->consp() && span->simplelistlength() >= 2 && span->first() && span->second() && span->first()->numberp() && span->second()->numberp()) { size_t sp_start = span->first()->intValue() ; size_t sp_end = span->second()->intValue() ; span = span->rest()->rest() ; // we'll allow rather free-form input from the rest of the span's // description: the first two numbers are the score and weight, // respectively, the first string is the span's text, the second // string (if present) becomes the INIT_TEXT metadata. Then, // the first structure (if present) is the metadata, and any lists // starting with a symbol become additional metadata fields double sp_score = DEFAULT_SCORE ; double sp_weight = DEFAULT_WEIGHT ; // scan for the first two numbers for (const FrList *sp = span ; sp ; sp = sp->rest()) { if (sp->first() && sp->first()->numberp()) { sp_score = sp->first()->floatValue() ; for (sp = sp->rest() ; sp ; sp = sp->rest()) { if (sp->first() && sp->first()->numberp()) { sp_weight = sp->first()->floatValue() ; break ; } } break ; } } const char *curr_text = 0 ; const char *orig_text = 0 ; // scan for the first two strings or symbols for (const FrList *sp = span ; sp ; sp = sp->rest()) { FrObject *item = sp->first() ; if (item && (item->stringp() || item->symbolp())) { curr_text = item->printableName() ; for (sp = sp->rest() ; sp ; sp = sp->rest()) { item = sp->first() ; if (item && (item->stringp() || item->symbolp())) { orig_text = item->printableName() ; break ; } } break ; } } if (curr_text) (void)FrSkipWhitespace(curr_text) ; if (orig_text) (void)FrSkipWhitespace(orig_text) ; // scan for the first structure const FrStruct *meta = 0 ; for (const FrList *sp = span ; sp ; sp = sp->rest()) { if (sp->first() && sp->first()->structp()) { meta = (FrStruct*)sp->first() ; break ; } } init(sp_start,sp_end,sp_score,sp_weight,curr_text,contain) ; free_object(m_metadata) ; if (meta) { FrSymbol 
*symMETATYPE = FrSymbolTable::add(METADATA_TYPENAME) ; if (meta->typeName() == symMETATYPE) m_metadata = (FrStruct*)meta->deepcopy() ; else { // copy the keywords one by one FrList *keys = meta->fieldNames() ; while (keys) { FrSymbol *key = (FrSymbol*)poplist(keys) ; setMetaData(key,meta->get(key)) ; } } } if (orig_text) setMetaData(FrSymbolTable::add(init_text_tag), new FrString(orig_text),false) ; // finally, scan for any embedded lists and add them as metadata fields for (const FrList *sp = span ; sp ; sp = sp->rest()) { FrList *item = (FrList*)sp->first() ; if (item && item->consp() && item->first() && item->first()->symbolp()) { FrSymbol *key = (FrSymbol*)item->first() ; setMetaData(key,item->rest()) ; } } return true ; } return false ; }
bool FrTFIDF::load(const char *filename) { if (filename && *filename) { FrITextFile wt(filename) ; if (!wt.good()) { FrWarningVA("unable to open term weights file '%s'",filename) ; return false ; } delete ht ; ht = new FrSymHashTable ; FrSymbol *symEOF = FrSymbolTable::add("*EOF*") ; char *line = wt.getline() ; bool expanded = false ; if (line && strncmp(line,"!!! ",4) == 0) { char *end = 0 ; total_docs = (size_t)strtol(line+4,&end,10) ; if (end && end != line+4) { char *tmp = end ; size_t vocab_size = (size_t)strtol(tmp,&end,10) ; if (vocab_size > 0 && end && end != tmp) { ht->expand(vocab_size+100) ; expanded = true ; } } } if (!expanded) // ensure some reasonable starting size ht->expand(5000) ; while ((line = wt.getline()) != 0) { if (FrSkipWhitespace(line) == ';' || *line == '\0') continue ; const char *origline = line ; FrSymbol *term = (FrSymbol*)string_to_FrObject(line) ; if (term == symEOF || !term || !term->symbolp()) { FrWarning("invalid line in term-weights file") ; free_object(term) ; continue ; } char *end = 0 ; size_t term_freq = strtol(line,&end,10) ; if (end && end != line) { line = end ; size_t doc_freq = strtol(line,&end,10) ; if (end != line) { if (doc_freq > 0 && term_freq > 0) { FrSymHashEntry *entry = tfidfRecord(term) ; FrTFIDFrecord *rec = new FrTFIDFrecord(term_freq,doc_freq) ; if (entry) { delete (FrTFIDFrecord*)entry->getUserData() ; entry->setUserData(rec) ; } else ht->add(term,(void*)rec) ; continue ; } FrWarning("invalid data in term-weights file -- both term\n" "\tand document frequencies must be nonzero") ; free_object(term) ; continue ; } } FrWarningVA("expected two integers following the term '%s'; line was\n" "\t%s", term->symbolName(), origline) ; free_object(term) ; } return true ; } return false ; }