示例#1
0
/**
 * Searches the emulation's history for m_regExp inside the range that starts at
 * (startColumn, startLine) and ends at (endColumn, endLine), both lines included.
 *
 * The history is decoded to plain text in blocks of at most 10000 lines. When
 * m_forwards is set the blocks are visited from startLine towards endLine and the
 * first match in a block wins; otherwise they are visited from endLine towards
 * startLine and the last match in a block wins.
 *
 * On success the match position is stored in m_foundStartColumn, m_foundStartLine,
 * m_foundEndColumn and m_foundEndLine (history coordinates, end position inclusive).
 *
 * @param startColumn  first column considered on startLine
 * @param startLine    first history line of the range
 * @param endColumn    column on endLine where the range stops; a value below 0
 *                     means "search to the end of the decoded text"
 * @param endLine      last history line of the range
 * @return true if a match was found, false otherwise
 */
bool HistorySearch::search(int startColumn, int startLine, int endColumn, int endLine) {
    qDebug() << "search from" << startColumn << "," << startLine
            <<  "to" << endColumn << "," << endLine;

    int linesRead = 0;
    const int linesToRead = endLine - startLine + 1;

    qDebug() << "linesToRead:" << linesToRead;

    // We read process history from (and including) startLine to (and including) endLine in
    // blocks of at most 10K lines so that we do not use unhealthy amounts of memory
    int blockSize;
    while ((blockSize = std::min(10000, linesToRead - linesRead)) > 0) {

        QString string;
        QTextStream searchStream(&string);
        PlainTextDecoder decoder;
        decoder.begin(&searchStream);
        decoder.setRecordLinePositions(true);

        // Calculate lines to read and read them. Backward searches consume the
        // range from its end, so their blocks start further and further up.
        const int blockStartLine = m_forwards ? startLine + linesRead : endLine - linesRead - blockSize + 1;
        const int chunkEndLine = blockStartLine + blockSize - 1;
        m_emulation->writeToStream(&decoder, blockStartLine, chunkEndLine);

        // The column limits only belong to the block that actually holds startLine
        // (respectively endLine); every other block is searched in full.
        const bool blockHoldsStartLine = (blockStartLine == startLine);
        const bool blockHoldsEndLine = (chunkEndLine == endLine);
        const int searchFrom = blockHoldsStartLine ? startColumn : 0;

        // Position (in the string) of endColumn in the last line of the string.
        int endPosition;

        // The string that Emulation::writeToStream produces has a newline at the end, and so
        // ends with an empty line - we ignore that.
        const int numberOfLinesInString = decoder.linePositions().size() - 1;
        if (blockHoldsEndLine && numberOfLinesInString > 0 && endColumn > -1)
        {
            endPosition = decoder.linePositions().at(numberOfLinesInString - 1) + endColumn;
        }
        else
        {
            endPosition = string.size();
        }

        // So now we can look for m_regExp in the string between searchFrom and endPosition
        int matchStart;
        if (m_forwards)
        {
            matchStart = string.indexOf(m_regExp, searchFrom);
            if (matchStart >= endPosition)
                matchStart = -1;
        }
        else
        {
            matchStart = string.lastIndexOf(m_regExp, endPosition - 1);
            if (matchStart < searchFrom)
                matchStart = -1;
        }

        if (matchStart > -1)
        {
            const int matchEnd = matchStart + m_regExp.matchedLength() - 1;
            qDebug() << "Found in string from" << matchStart << "to" << matchEnd;

            // Translate matchStart and matchEnd to column/line pairs in the history. Line
            // numbers inside the string are relative to the first line of this block.
            const int startLineNumberInString = findLineNumberInString(decoder.linePositions(), matchStart);
            m_foundStartColumn = matchStart - decoder.linePositions().at(startLineNumberInString);
            m_foundStartLine = startLineNumberInString + blockStartLine;

            const int endLineNumberInString = findLineNumberInString(decoder.linePositions(), matchEnd);
            m_foundEndColumn = matchEnd - decoder.linePositions().at(endLineNumberInString);
            m_foundEndLine = endLineNumberInString + blockStartLine;

            qDebug() << "m_foundStartColumn" << m_foundStartColumn
                    << "m_foundStartLine" << m_foundStartLine
                    << "m_foundEndColumn" << m_foundEndColumn
                    << "m_foundEndLine" << m_foundEndLine;

            return true;
        }

        linesRead += blockSize;
    }

    qDebug() << "Not found";
    return false;
}
/**
 * @brief Evaluates a prefix-operator query and returns the matching pages,
 *        ordered by descending relevancy.
 *
 * The query is split on whitespace and stop words are dropped. Every remaining
 * token is lower-cased and stemmed. Recognised shapes:
 *   - "term"                : pages listed for that single term
 *   - "AND t1 t2 ..."       : pages present in every term's list
 *   - "OR t1 t2 ..."        : pages present in any term's list
 *   - "<positive> NOT x..." : the positive result minus the pages of every x
 * "NOT" is only recognised in upper case (it is tested before lower-casing);
 * AND/OR are matched after lower-casing.
 *
 * Side effects: rawResults receives one page list per looked-up term (positive
 * terms first, then excluded terms), processedResults receives the final pages
 * and relevancy receives one score per page, index-aligned with processedResults.
 *
 * NOTE(review): set_intersection/set_difference require sorted inputs; this
 * assumes index->getListOfPages returns lists sorted by pointer value - confirm.
 * NOTE(review): a multi-word query without a leading AND/OR keeps the historical
 * behaviour: the first token is not looked up and only the second token's list
 * is used, while relevancy is still counted for the first token.
 *
 * @param structure  passed through unchanged to index->getListOfPages
 * @param search     raw query text
 * @return processedResults (empty when the query has no positive term)
 */
vector<Page*>* QueryProcessor::process(int structure, string search)
{
    // Start from a clean slate; the result containers are reused between calls.
    processedResults->clear();
    rawResults->clear();
    relevancy->clear();

    // Tokenise and drop stop words in a single pass, so that consecutive stop
    // words are all removed.
    std::vector<std::string> tokens;
    {
        std::istringstream searchStream(search);
        std::string word;
        while (searchStream >> word)
        {
            if (stopWords->find(word) == stopWords->end())
            {
                tokens.push_back(word);
            }
        }
    }

    // Locate NOT (the last occurrence wins), then lower-case and stem every token.
    bool hasNot = false;
    std::size_t notPos = 0;
    for (std::size_t i = 0; i < tokens.size(); ++i)
    {
        if (tokens[i] == "NOT")
        {
            hasNot = true;
            notPos = i;
        }

        // tolower must not be handed a negative char value.
        std::transform(tokens[i].begin(), tokens[i].end(), tokens[i].begin(),
                       [](unsigned char c) { return static_cast<char>(::tolower(c)); });

        Porter2Stemmer::stem(tokens[i]);
    }

    // Tokens in [0, positiveEnd) form the positive part of the query.
    const std::size_t positiveEnd = hasNot ? notPos : tokens.size();
    if (positiveEnd == 0)
    {
        // Empty query, or a query that starts with NOT: nothing to search for.
        return processedResults;
    }

    // A lone positive token is the search term itself; otherwise token 0 is
    // treated as the operator slot and the terms follow it.
    const std::size_t firstTerm = (positiveEnd == 1) ? 0 : 1;
    const std::size_t termCount = (positiveEnd == 1) ? 1 : positiveEnd - 1;

    for (std::size_t k = 0; k < termCount; ++k)
    {
        rawResults->push_back(index->getListOfPages(structure, tokens[firstTerm + k]));
    }

    const bool hasOperatorSlot = (firstTerm == 1);
    const bool isAnd = hasOperatorSlot && tokens[0] == "and";
    const bool isOr = hasOperatorSlot && tokens[0] == "or";

    // Single term (or no recognised operator): the first list is the result.
    std::vector<Page*> combined = (*rawResults)[0];

    if (isAnd && termCount >= 2)
    {
        // Intersect list by list; each step writes into a fresh vector because
        // the output range must not overlap the inputs.
        for (std::size_t i = 1; i < termCount; ++i)
        {
            const auto& other = (*rawResults)[i];
            std::vector<Page*> narrowed;
            std::set_intersection(combined.begin(), combined.end(),
                                  other.begin(), other.end(),
                                  std::back_inserter(narrowed));
            combined.swap(narrowed);
        }
    }
    else if (isOr && termCount >= 2)
    {
        for (std::size_t i = 1; i < termCount; ++i)
        {
            combined = merge_copy(combined, (*rawResults)[i]);
            std::sort(combined.begin(), combined.end());
            combined.erase(std::unique(combined.begin(), combined.end()), combined.end());
        }
    }

    // NOT: look up every term after NOT and subtract its pages. When NOT is the
    // last token there is nothing to subtract and both loops are no-ops.
    if (hasNot)
    {
        const std::size_t firstExcluded = rawResults->size();
        for (std::size_t i = notPos + 1; i < tokens.size(); ++i)
        {
            rawResults->push_back(index->getListOfPages(structure, tokens[i]));
        }

        for (std::size_t i = firstExcluded; i < rawResults->size(); ++i)
        {
            const auto& excluded = (*rawResults)[i];
            std::vector<Page*> kept;
            std::set_difference(combined.begin(), combined.end(),
                                excluded.begin(), excluded.end(),
                                std::back_inserter(kept));
            combined.swap(kept);
        }
    }

    *processedResults = combined;

    // Relevancy: occurrence count of the query terms on each page. AND/OR
    // queries sum over all positive terms, everything else uses token 0.
    const std::size_t pageCount = processedResults->size();
    relevancy->reserve(pageCount);
    for (std::size_t i = 0; i < pageCount; ++i)
    {
        Page* page = (*processedResults)[i];
        int score = 0;

        if (isAnd || isOr)
        {
            for (std::size_t k = 0; k < termCount; ++k)
            {
                score += page->getCount(tokens[1 + k]);
            }
        }
        else
        {
            score += page->getCount(tokens[0]);
        }

        relevancy->push_back(score);
    }

    // Order pages by descending relevancy. The permutation is sorted once and
    // then applied to both parallel containers; ties keep their current order.
    if (pageCount > 1)
    {
        std::vector<std::size_t> order(pageCount);
        for (std::size_t i = 0; i < pageCount; ++i)
        {
            order[i] = i;
        }

        const auto scores = *relevancy;
        const std::vector<Page*> pages = *processedResults;

        std::stable_sort(order.begin(), order.end(),
                         [&scores](std::size_t a, std::size_t b) { return scores[a] > scores[b]; });

        for (std::size_t i = 0; i < pageCount; ++i)
        {
            (*relevancy)[i] = scores[order[i]];
            (*processedResults)[i] = pages[order[i]];
        }
    }

    return processedResults;
}