Esempio n. 1
0
// Adds every word of `wb` to the index under `url`.
//
// Returns false, leaving the index untouched, when `url` has already been
// incorporated or when `wb` yields no words at all. Otherwise the URL is
// given a fresh integer ID, the url <-> ID association is recorded in both
// urlToId and idToUrl, an (ID, count) Pair is appended to the posting list of
// each word in the bag, and true is returned.
//
// Assignment requirement (from the original notes): incorporating a WordBag
// of W distinct words into an index that already holds N words must take far
// less than O(WN) time, i.e. adding a single word mapping must take far less
// than O(N).
bool IndexerImpl::incorporate(string url, WordBag& wb)
{
    // A URL may only be incorporated once.
    if (urlToId.find(url) != nullptr)
        return false;

    string word;
    int count;
    // Nothing to index: bail out before an ID is allocated for the URL.
    if (!wb.getFirstWord(word, count))
        return false;

    // Draw random IDs in [0, 100000) until one that is not yet in idToUrl
    // comes up.
    // NOTE(review): this loop cannot terminate once all 100000 IDs are in use.
    int id = rand() % 100000;
    while (idToUrl.find(id) != nullptr)
        id = rand() % 100000;
    urlToId.associate(url, id);
    idToUrl.associate(id, url);

    // The first word has already been fetched above, hence do/while.
    do {
        // Look the word up once and reuse the result for both cases.
        vector<Pair>* postings = wordToIdCounts.find(word);
        if (postings != nullptr) {
            // Known word: append to its existing posting list in place.
            postings->push_back(Pair(id, count));
        } else {
            // New word: start a posting list holding just this URL.
            vector<Pair> fresh;
            fresh.push_back(Pair(id, count));
            wordToIdCounts.associate(word, fresh);
        }
    } while (wb.getNextWord(word, count));

    return true;
}
Esempio n. 2
0
// Replaces the current index with the data stored in "<filenameBase>.txt".
//
// File layout, as consumed below (after the expected fields of a line have
// been read, up to 10000 further characters through the newline are skipped):
//   <number of words>
//   per word:    <word> <number of postings>
//                followed by one "<hash> <count>" line per posting
//   <number of idToUrl buckets>
//   per bucket:  <bucket index> <number of entries>
//                followed by one "<key> <url>" line per entry
//
// Returns true when the whole file was read. Returns false when the file
// cannot be opened or when any expected field is missing or malformed; the
// previous index has already been discarded at that point, so the index may
// be left empty or partially loaded.
bool IndexerImpl::load(string filenameBase)
{
    // Drop the existing index (clear() is defined elsewhere) and start over
    // with a fresh word -> postings map.
    clear();
    urlToCount = new MyMap<string, vector<point> >;

    const string path = filenameBase + ".txt";
    ifstream infile(path);
    if ( ! infile )
    {
        cerr << "Error: Cannot open " << path << "!" << endl;
        return false;
    }

    // ---- Section 1: word -> postings ----
    int urlToCountsize;
    if (!(infile >> urlToCountsize))
        return false;
    infile.ignore(10000, '\n');
    for (int i = 0; i < urlToCountsize; i++)
    {
        string tempword;
        int vectorsize;
        if (!(infile >> tempword >> vectorsize))
            return false;
        infile.ignore(10000, '\n');

        vector<point> tempvec;
        for (int j = 0; j < vectorsize; j++)
        {
            int hash, count;
            if (!(infile >> hash >> count))
                return false;
            infile.ignore(10000, '\n');
            tempvec.push_back(point(hash, count));
        }
        urlToCount->associate(tempword, tempvec);
    }

    // ---- Section 2: idToUrl buckets ----
    int idToUrlsize;
    if (!(infile >> idToUrlsize))
        return false;
    infile.ignore(10000, '\n');
    for (int j = 0; j < idToUrlsize; j++)
    {
        // NOTE(review): `index` is taken from the file as-is and used to
        // subscript idToUrl; its valid range is not visible here -- confirm
        // against the declaration of idToUrl and the matching save routine.
        int index, count;
        if (!(infile >> index >> count))
            return false;
        infile.ignore(10000, '\n');
        for (int i = 0; i < count; i++)
        {
            int key;
            string url;
            if (!(infile >> key >> url))
                return false;
            infile.ignore(10000, '\n');
            // Buckets are created lazily, on the first entry that lands in them.
            if (idToUrl[index]==NULL)
                idToUrl[index]=new MyMap<int, string>;
            idToUrl[index]->associate(key, url);
        }
    }
    return true;
}
Esempio n. 3
0
// Loads the associations stored in the file named `filename` into `m`,
// discarding whatever `m` held before.
//
// As read below, the file starts with the number of associations, followed
// by that many key/value pairs. Each key and each value is read with
// readItem(), after which the rest of its line (up to 10000 characters) is
// skipped.
//
// Returns true when every association was read. Returns false when the file
// cannot be opened or any readItem() call fails; `m` has already been cleared
// by then and may hold only the associations read before the failure.
bool loadMyMap(string filename, MyMap<KeyType, ValueType>& m)
{
    m.clear();

    // Open the file the caller named (the parameter, not a literal string).
    ifstream stream(filename);
    if (!stream) return false;

    // Read the number of associations from the stream, failing if we can't.
    int size = 0;
    if (!readItem(stream, size)) return false;

    KeyType key;
    ValueType value;
    for (int i = 0; i < size; i++) {
        if (!readItem(stream, key)) return false;
        stream.ignore(10000, '\n');
        if (!readItem(stream, value)) return false;
        stream.ignore(10000, '\n');
        m.associate(key, value);
    }
    return true;
}
Esempio n. 4
0
// Adds every word of `wb` to the index under `url`.
//
// The URL is identified by the value hashurl() stores in `hash`; hashurl()'s
// return value selects the idToUrl bucket that holds the hash -> url entry.
// Returns false, without touching the word index, when that bucket already
// contains `hash`. Otherwise records the URL, appends a (hash, count) point
// to the posting list of each lower-cased word in the bag, and returns true.
//
// NOTE(review): the "already incorporated" test compares hashes only, so two
// different URLs with the same hash would be treated as the same URL --
// confirm that hashurl() makes this acceptable.
bool IndexerImpl::incorporate(string url, WordBag& wb)
{
    int hash;
    // hashurl() fills in `hash` and returns the bucket index for this URL.
    int modhash = hashurl(url,hash);

    // Buckets are allocated lazily, the first time a URL lands in them.
    if (idToUrl[modhash]==NULL)
        idToUrl[modhash]=new MyMap<int,string>;

    // This URL (by hash) has already been passed to incorporate().
    if (idToUrl[modhash]->find(hash)!=NULL)
        return false;

    // Note the bucket index the first time the bucket receives an entry.
    if (idToUrl[modhash]->size()==0)
        indexes.push_back(modhash);
    idToUrl[modhash]->associate(hash,url);

    string word;
    int count;
    // One pass over the distinct words of the bag.
    bool gotAWord = wb.getFirstWord(word, count);
    while (gotAWord)
    {
        // Indexing is case-insensitive.
        strToLower(word);
        point bucket = point(hash,count);

        vector<point>* postings = urlToCount->find(word);
        if (postings != NULL)
        {
            // Known word: find() hands back the stored list, so appending in
            // place is enough -- no re-association (and no list copy) needed.
            postings->push_back(bucket);
        }
        else
        {
            // New word: build the list on the stack so nothing is leaked;
            // associate() receives it and keeps its own copy.
            vector<point> fresh;
            fresh.push_back(bucket);
            urlToCount->associate(word, fresh);
        }
        gotAWord = wb.getNextWord(word,count);
    }
    // One map lookup per word; the original author's target is O(W log N)
    // overall, which holds provided MyMap::find/associate are O(log N).
    return true;
}