示例#1
0
/*
 * ScanArticle
 * -----------
 * Tokenizes one article from `st`, indexes every well-formed word that is
 * not a stop word, and prints a short summary (indexed-word count and the
 * longest indexed word) to stdout.
 *
 *   st        tokenizer positioned on the article text
 *   userData  opaque pointer that is used as an rssFeedData *; the article
 *             is registered in its `articles` member, stop words are looked
 *             up in `stopWords`, and hits are recorded in `indices`
 */
static void ScanArticle(streamtokenizer *st, void* userData)
{
  rssFeedData *feed = userData;
  articleData *current = AddArticle(&feed->articles, &feed->item);

  char token[1024];
  char longest[1024] = {'\0'};
  int wordCount = 0;

  while (STNextToken(st, token, sizeof(token))) {
    /* The lookup below is handed the address of a char *; `&token` would
       have array-pointer type instead, hence this alias. */
    char *key = token;
    indexData *record;

    if (strcasecmp(token, "<") == 0) {
      SkipIrrelevantContent(st); /* start of a tag: skip past the markup */
      continue;
    }

    RemoveEscapeCharacters(token);
    if (!WordIsWellFormed(token))
      continue;

    if (HashSetLookup(&feed->stopWords, &key) != NULL)
      continue; /* stop words are neither indexed nor counted */

    record = addWordRecord(&feed->indices, token);
    indexWord(&record->data, current);

    wordCount++;
    /* Strictly longer only, so the first word of a given length wins. */
    if (strlen(token) > strlen(longest))
      strcpy(longest, token);
  }

  printf("\tWe counted %d well-formed words [including duplicates].\n", wordCount);
  printf("\tThe longest word scanned was \"%s\".", longest);
  /* Only un-hyphenated words of 15+ characters earn the extra remark. */
  if (strchr(longest, '-') == NULL && strlen(longest) >= 15)
    printf(" [Ooooo... long word!]");
  printf("\n");
}
void CParserOptions::InitFromConfigObject(const TTextMinerConfig& config) {
    for (int i = 0; i < config.facts_size(); ++i) {
        TTextMinerConfig::TFactTypeRef ref = config.get_idx_facts(i);
        AddFactToShow(ref.name(), ref.nonequality());
    }

    for (int i = 0; i < config.articles_size(); ++i) {
        TTextMinerConfig::TArticleRef ref = config.get_idx_articles(i);
        AddArticle(UTF8ToWide(ref.name()));
    }

    for (int i = 0; i < config.situations_size(); ++i) {
        TTextMinerConfig::TArticleRef ref = config.get_idx_situations(i);
        AddSituation(UTF8ToWide(ref.name()));
    }

    SetIgnoreUpperCase(config.GetIgnoreUpperCase());

    m_maxNamesCount = config.GetMaxNamesCount();
}
示例#3
0
/****************************************************************
AddArticleXOVER()
      This function accepts article header information
      formatted in the standard NNTP XOVER format.  
param
  LPSTR articleHeader -  An XOVER formated string that hold the raw data 
        to be parsed as the header information
RETURN
    UTE_SUCCESS                 - success    
    UTE_PARAMETER_INVALID_VALUE - invalid pointer parameter
****************************************************************/
int CUT_NNTPArticleList::AddArticleXOVER(LPSTR articleHeader)
{
    char             temp[1024];
    UT_ARTICLEINFOA  workArticle;

    if(articleHeader == NULL)
        return UTE_PARAMETER_INVALID_VALUE;

    int fieldCounter = 0; // Counter for the number of fields in the header tokened by a tab Or NULL

    // scan string for two tabs together and insert a space between
    // them.  This is necessary to ensure that the strtok function
    // will not get gummed up.  strtok will skip leading tokens in
    // a string, so if two tokens are side by side, the function,
    // will skip one of the parameters rather than returning a null
    // parameter.

    if(strstr(articleHeader, "\t\t") != NULL) {
        char buf[2500];
        int i = 0, j = 0;
        do {
            if (articleHeader[i] == '\t' && articleHeader[i+1] == '\t') {
                buf[j++]=articleHeader[i++];
                buf[j++]=' ';
                buf[j++]=articleHeader[i++];
                } 
            else 
                buf[j++]=articleHeader[i++];        
        } while (articleHeader[i] != '\0');
            
        buf[j] = '\0';
        strcpy(articleHeader, buf);
    }

	int nId = 0;  // by default
	int nSubject  = 0;
	int nXref = 0;
	int nFrom = 0;
	int nDate = 0;
	int nByte = 0;
	int nMsgId = 0;
	int nNumberOfLines = 0;
	int nRefrence = 0;
	int nTopic = 0;

	/* Updated Version 4.x 
	Each line of output will be formatted with the article number,
	followed by each of the headers in the overview database or the
	article itself (when the data is not available in the overview
	database) for that article separated by a tab character.  The
	sequence of fields must be in this order: subject, author, date,
	message-id, references, byte count, and line count.  Other optional
	fields may follow line count.  Other optional fields may follow line
	count.  These fields are specified by examining the response to the
	LIST OVERVIEW.FMT command.  Where no data exists, a null field must
	be provided (i.e. the output will have two tab characters adjacent to
	each other).  Servers should not output fields for articles that have
	been removed since the XOVER database was created.
								GW
	*/

	for (int loop = 0; loop < m_OverviewFormat.GetCount (); loop++ )
	{
		if (nSubject == 0 && stricmp(m_OverviewFormat.GetString (loop),"Subject") == 0)
		{
			nSubject = loop+1;

		}
		else if (nFrom == 0 && stricmp(m_OverviewFormat.GetString (loop),"From") == 0)
		{
			nFrom = loop+1;

		}
		else if (nDate == 0 && stricmp(m_OverviewFormat.GetString (loop),"Date") == 0)
		{
			nDate = loop+1;

		}
		else if (nXref == 0 && stricmp(m_OverviewFormat.GetString (loop),"Xref") == 0)
		{
			nXref = loop+1;

		}
		else if (nMsgId == 0 && stricmp(m_OverviewFormat.GetString (loop),"Message-Id") == 0)
		{
			nMsgId = loop+1;

		}
		else if (nByte == 0 && stricmp(m_OverviewFormat.GetString (loop),"Bytes") == 0)
		{
			nByte = loop+1;

		}
		else if (nNumberOfLines == 0 && stricmp(m_OverviewFormat.GetString (loop),"Lines") == 0)
		{
			nNumberOfLines = loop+1;

		}
		else if (nRefrence == 0 && stricmp(m_OverviewFormat.GetString (loop),"References") == 0)
		{
			nRefrence = loop+1;
		}			
		else if (nTopic == 0 && stricmp(m_OverviewFormat.GetString (loop),"Thread-Topic") == 0)
		{
			nTopic = loop+1;
		}			
	}
	// string operation is way too consuming 
    // Get the number of peices
    fieldCounter = CUT_StrMethods::GetParseStringPieces(articleHeader, "\t");

	CUT_StrMethods::ParseString (articleHeader,"\t",nId, &(workArticle.nArticleId));
	if (nSubject > 0)
	{
		CUT_StrMethods::ParseString (articleHeader,"\t",nSubject,temp,sizeof(temp)-1);
		workArticle.lpszSubject = new char[strlen(temp)+1];
        strcpy(workArticle.lpszSubject, temp);
	}     
	if (nFrom > 0)
	{
		CUT_StrMethods::ParseString (articleHeader,"\t",nFrom,temp,sizeof(temp)-1);
	    workArticle.lpszAuthor = new char[strlen(temp)+1];
		strcpy(workArticle.lpszAuthor, temp);   
	}
    // Date
		if (nDate > 0)
	{
		CUT_StrMethods::ParseString (articleHeader,"\t",nDate,temp,sizeof(temp)-1);
	    workArticle.lpszDate = new char[strlen(temp)+1];
		strcpy(workArticle.lpszDate, temp);   
	}

	if (nMsgId > 0)
	{
		CUT_StrMethods::ParseString (articleHeader,"\t",nMsgId,temp,sizeof(temp)-1);
	    workArticle.lpszMessageId = new char[strlen(temp)+1];
		strcpy(workArticle.lpszMessageId, temp);   
	}

	if (nRefrence > 0)
	{
		CUT_StrMethods::ParseString (articleHeader,"\t",nRefrence,temp,sizeof(temp)-1);
	    workArticle.lpszReferences = new char[strlen(temp)+1];
		strcpy(workArticle.lpszReferences, temp);   
	}
	// bytecount
	if (nByte > 0)
	{
		long numbByte = 0;
		CUT_StrMethods::ParseString (articleHeader,"\t",nByte, &numbByte );

		workArticle.nByteCount = (int )numbByte ;
	}	

	if (nNumberOfLines > 0)
	{
		long numbByte = 0;
		CUT_StrMethods::ParseString (articleHeader,"\t",nNumberOfLines, &numbByte );
		workArticle.nLineCount  = (int )numbByte ;
	}

    AddArticle(workArticle);
    
    return UTE_SUCCESS;
}