/*
 * ScanArticle
 * -----------
 * Registers a new article for the current feed item, then reads the article
 * token by token from the stream tokenizer.  A "<" token hands control to
 * SkipIrrelevantContent; every other token has its escape characters removed
 * and, if it is well formed and not present in the stop-word set, is added to
 * the word index for this article.  A short summary (word count and longest
 * indexed word) is printed to stdout when the stream is exhausted.
 *
 * st       - tokenizer positioned at the start of the article text
 * userData - opaque pointer that is treated as an rssFeedData*
 */
static void ScanArticle(streamtokenizer *st, void* userData)
{
  rssFeedData *feed = (rssFeedData *) userData;
  articleData *currentArticle = AddArticle(&feed->articles, &feed->item);
  char token[1024];
  char longest[1024] = {'\0'};
  size_t longestLen = 0;   /* always equal to strlen(longest) */
  int wordCount = 0;

  while (STNextToken(st, token, sizeof(token))) {
    /* The lookup is given the address of a char*; &token would instead be a
       pointer to the whole array, so go through a plain pointer variable. */
    char *lookupKey = token;
    indexData *entry;
    size_t tokenLen;

    if (strcasecmp(token, "<") == 0) {
      SkipIrrelevantContent(st); /* in html-utls.h */
      continue;
    }

    RemoveEscapeCharacters(token);
    if (!WordIsWellFormed(token))
      continue;

    /* Words found in the stop-word set are not indexed or counted. */
    if (HashSetLookup(&feed->stopWords, &lookupKey) != NULL)
      continue;

    entry = addWordRecord(&feed->indices, token);
    indexWord(&entry->data, currentArticle);
    wordCount++;

    tokenLen = strlen(token);
    if (tokenLen > longestLen) {
      strcpy(longest, token);
      longestLen = tokenLen;
    }
  }

  printf("\tWe counted %d well-formed words [including duplicates].\n", wordCount);
  printf("\tThe longest word scanned was \"%s\".", longest);
  if (longestLen >= 15 && strchr(longest, '-') == NULL)
    printf(" [Ooooo... long word!]");
  printf("\n");
}
void CParserOptions::InitFromConfigObject(const TTextMinerConfig& config) { for (int i = 0; i < config.facts_size(); ++i) { TTextMinerConfig::TFactTypeRef ref = config.get_idx_facts(i); AddFactToShow(ref.name(), ref.nonequality()); } for (int i = 0; i < config.articles_size(); ++i) { TTextMinerConfig::TArticleRef ref = config.get_idx_articles(i); AddArticle(UTF8ToWide(ref.name())); } for (int i = 0; i < config.situations_size(); ++i) { TTextMinerConfig::TArticleRef ref = config.get_idx_situations(i); AddSituation(UTF8ToWide(ref.name())); } SetIgnoreUpperCase(config.GetIgnoreUpperCase()); m_maxNamesCount = config.GetMaxNamesCount(); }
/**************************************************************** AddArticleXOVER() This function accepts article header information formatted in the standard NNTP XOVER format. param LPSTR articleHeader - An XOVER formated string that hold the raw data to be parsed as the header information RETURN UTE_SUCCESS - success UTE_PARAMETER_INVALID_VALUE - invalid pointer parameter ****************************************************************/ int CUT_NNTPArticleList::AddArticleXOVER(LPSTR articleHeader) { char temp[1024]; UT_ARTICLEINFOA workArticle; if(articleHeader == NULL) return UTE_PARAMETER_INVALID_VALUE; int fieldCounter = 0; // Counter for the number of fields in the header tokened by a tab Or NULL // scan string for two tabs together and insert a space between // them. This is necessary to ensure that the strtok function // will not get gummed up. strtok will skip leading tokens in // a string, so if two tokens are side by side, the function, // will skip one of the parameters rather than returning a null // parameter. if(strstr(articleHeader, "\t\t") != NULL) { char buf[2500]; int i = 0, j = 0; do { if (articleHeader[i] == '\t' && articleHeader[i+1] == '\t') { buf[j++]=articleHeader[i++]; buf[j++]=' '; buf[j++]=articleHeader[i++]; } else buf[j++]=articleHeader[i++]; } while (articleHeader[i] != '\0'); buf[j] = '\0'; strcpy(articleHeader, buf); } int nId = 0; // by default int nSubject = 0; int nXref = 0; int nFrom = 0; int nDate = 0; int nByte = 0; int nMsgId = 0; int nNumberOfLines = 0; int nRefrence = 0; int nTopic = 0; /* Updated Version 4.x Each line of output will be formatted with the article number, followed by each of the headers in the overview database or the article itself (when the data is not available in the overview database) for that article separated by a tab character. The sequence of fields must be in this order: subject, author, date, message-id, references, byte count, and line count. Other optional fields may follow line count. 
Other optional fields may follow line count. These fields are specified by examining the response to the LIST OVERVIEW.FMT command. Where no data exists, a null field must be provided (i.e. the output will have two tab characters adjacent to each other). Servers should not output fields for articles that have been removed since the XOVER database was created. GW */ for (int loop = 0; loop < m_OverviewFormat.GetCount (); loop++ ) { if (nSubject == 0 && stricmp(m_OverviewFormat.GetString (loop),"Subject") == 0) { nSubject = loop+1; } else if (nFrom == 0 && stricmp(m_OverviewFormat.GetString (loop),"From") == 0) { nFrom = loop+1; } else if (nDate == 0 && stricmp(m_OverviewFormat.GetString (loop),"Date") == 0) { nDate = loop+1; } else if (nXref == 0 && stricmp(m_OverviewFormat.GetString (loop),"Xref") == 0) { nXref = loop+1; } else if (nMsgId == 0 && stricmp(m_OverviewFormat.GetString (loop),"Message-Id") == 0) { nMsgId = loop+1; } else if (nByte == 0 && stricmp(m_OverviewFormat.GetString (loop),"Bytes") == 0) { nByte = loop+1; } else if (nNumberOfLines == 0 && stricmp(m_OverviewFormat.GetString (loop),"Lines") == 0) { nNumberOfLines = loop+1; } else if (nRefrence == 0 && stricmp(m_OverviewFormat.GetString (loop),"References") == 0) { nRefrence = loop+1; } else if (nTopic == 0 && stricmp(m_OverviewFormat.GetString (loop),"Thread-Topic") == 0) { nTopic = loop+1; } } // string operation is way too consuming // Get the number of peices fieldCounter = CUT_StrMethods::GetParseStringPieces(articleHeader, "\t"); CUT_StrMethods::ParseString (articleHeader,"\t",nId, &(workArticle.nArticleId)); if (nSubject > 0) { CUT_StrMethods::ParseString (articleHeader,"\t",nSubject,temp,sizeof(temp)-1); workArticle.lpszSubject = new char[strlen(temp)+1]; strcpy(workArticle.lpszSubject, temp); } if (nFrom > 0) { CUT_StrMethods::ParseString (articleHeader,"\t",nFrom,temp,sizeof(temp)-1); workArticle.lpszAuthor = new char[strlen(temp)+1]; strcpy(workArticle.lpszAuthor, temp); } // Date if 
(nDate > 0) { CUT_StrMethods::ParseString (articleHeader,"\t",nDate,temp,sizeof(temp)-1); workArticle.lpszDate = new char[strlen(temp)+1]; strcpy(workArticle.lpszDate, temp); } if (nMsgId > 0) { CUT_StrMethods::ParseString (articleHeader,"\t",nMsgId,temp,sizeof(temp)-1); workArticle.lpszMessageId = new char[strlen(temp)+1]; strcpy(workArticle.lpszMessageId, temp); } if (nRefrence > 0) { CUT_StrMethods::ParseString (articleHeader,"\t",nRefrence,temp,sizeof(temp)-1); workArticle.lpszReferences = new char[strlen(temp)+1]; strcpy(workArticle.lpszReferences, temp); } // bytecount if (nByte > 0) { long numbByte = 0; CUT_StrMethods::ParseString (articleHeader,"\t",nByte, &numbByte ); workArticle.nByteCount = (int )numbByte ; } if (nNumberOfLines > 0) { long numbByte = 0; CUT_StrMethods::ParseString (articleHeader,"\t",nNumberOfLines, &numbByte ); workArticle.nLineCount = (int )numbByte ; } AddArticle(workArticle); return UTE_SUCCESS; }