/**
 * Distribute the entries collected in hashSubject and hashObject over the
 * three output lists, which are emptied first:
 *   - shared:   key found in both hashSubject and hashObject
 *               (the entry stored on the subject side is the one kept),
 *   - subjects: key found only in hashSubject,
 *   - objects:  key found only in hashObject.
 *
 * @param listener Progress sink handed to NOTIFYCOND after every visited entry.
 * @return void
 */
void PlainDictionary::split(ProgressListener *listener) {
	subjects.clear();
	shared.clear();
	objects.clear();

	unsigned int totalEntries = hashSubject.size()+hashObject.size();
	unsigned int visited = 0;

	// Pass 1: walk the subject side. The walk also stops at the first entry
	// whose key evaluates to false.
	DictEntryIt subjIt = hashSubject.begin();
	while(subjIt!=hashSubject.end() && subjIt->first) {
		const bool alsoObject = hashObject.find(subjIt->first)!=hashObject.end();
		if(alsoObject) {
			// Present on both sides
			shared.push_back(subjIt->second);
		} else {
			// Subject only
			subjects.push_back(subjIt->second);
		}
		visited++;
		NOTIFYCOND(listener, "Extracting shared subjects", visited, totalEntries);
		++subjIt;
	}

	// Pass 2: walk the object side, keeping only entries that have no subject
	// counterpart (the shared ones were already handled in pass 1).
	DictEntryIt objIt = hashObject.begin();
	while(objIt!=hashObject.end()) {
		const bool alsoSubject = hashSubject.find(objIt->first)!=hashSubject.end();
		if(!alsoSubject) {
			// Object only
			objects.push_back(objIt->second);
		}
		visited++;
		NOTIFYCOND(listener, "Extracting shared objects", visited, totalEntries);
		++objIt;
	}
}
void PlainDictionary::save(std::ostream &output, ControlInformation &controlInformation, ProgressListener *listener) { controlInformation.setFormat(HDTVocabulary::DICTIONARY_TYPE_PLAIN); controlInformation.setUint("mapping", this->mapping); controlInformation.setUint("sizeStrings", this->sizeStrings); controlInformation.setUint("numEntries", this->getNumberOfElements()); controlInformation.save(output); unsigned int i = 0; unsigned int counter=0; const char marker = '\1'; //shared subjects-objects from subjects for (i = 0; i < shared.size(); i++) { output << shared[i]->str; output.put(marker); //character to split file counter++; NOTIFYCOND(listener, "PlainDictionary saving shared", counter, getNumberOfElements()); } output.put(marker); //extra line to set the begining of next part of dictionary //not shared subjects for (i = 0; i < subjects.size(); i++) { output << subjects[i]->str; output.put(marker); //character to split file counter++; NOTIFYCOND(listener, "PlainDictionary saving subjects", counter, getNumberOfElements()); } output.put(marker); //extra line to set the begining of next part of dictionary //not shared objects for (i = 0; i < objects.size(); i++) { output << objects[i]->str; output.put(marker); //character to split file counter++; NOTIFYCOND(listener, "PlainDictionary saving objects", counter, getNumberOfElements()); } output.put(marker); //extra line to set the begining of next part of dictionary //predicates for (i = 0; i < predicates.size(); i++) { output << predicates[i]->str; output.put(marker); //character to split file counter++; NOTIFYCOND(listener, "PlainDictionary saving predicates", counter, getNumberOfElements()); } output.put(marker); }
void PlainDictionary::load(std::istream & input, ControlInformation &ci, ProgressListener *listener) { std::string line; unsigned char region = 1; startProcessing(); std::string format = ci.getFormat(); if(format!=getType()) { throw "Trying to read a PlainDictionary but the data is not PlainDictionary"; } this->mapping = ci.getUint("mapping"); this->sizeStrings = ci.getUint("sizeStrings"); unsigned int numElements = ci.getUint("numEntries"); unsigned int numLine = 0; IntermediateListener iListener(listener); iListener.setRange(0,25); while(region<5 && getline(input, line,'\1')) { //std::cout << line << std::endl; if(line!="") { if (region == 1) { //shared SO NOTIFYCOND(&iListener, "Dictionary loading shared area.", numLine, numElements); insert(line, SHARED_SUBJECT); } else if (region == 2) { //not shared Subjects NOTIFYCOND(&iListener, "Dictionary loading subjects.", numLine, numElements); insert(line, NOT_SHARED_SUBJECT); NOTIFYCOND(&iListener, "Dictionary loading objects.", numLine, numElements); } else if (region == 3) { //not shared Objects insert(line, NOT_SHARED_OBJECT); NOTIFYCOND(&iListener, "Dictionary loading predicates.", numLine, numElements); } else if (region == 4) { //predicates insert(line, NOT_SHARED_PREDICATE); } } else { region++; } numLine++; } // No stopProcessing() Needed. Dictionary already split and sorted in file. updateIDs(); }
/**
 * Build the two compact sequences (Y and Z) from a set of triples.
 *
 * The triples are sorted by `order` and then scanned once. Each triple is
 * passed through swapComponentOrder(triple, SPO, order) and its three
 * components are read with getSubject()/getPredicate()/getObject(), called
 * x, y and z below.
 *   - vectorY receives the y value of every new (x, y) pair, with a 0 pushed
 *     before the first y of each new x (except the very first one);
 *   - vectorZ receives every z value, with a 0 pushed before the first z of
 *     each new (x, y) pair (except the very first one).
 * NOTE(review): using 0 as a separator presumably relies on component IDs
 * never being 0 - confirm against the ID assignment.
 *
 * The sequences are finally appended to streamY and streamZ, and numTriples
 * is incremented once per triple read.
 *
 * @param triples  Source triples; sorted in place by this call.
 * @param listener Progress sink handed to NOTIFYCOND for every triple after the first.
 */
void CompactTriples::load(ModifiableTriples &triples, ProgressListener *listener) {
	triples.sort(order);

	IteratorTripleID *it = triples.searchAll();

	vector<unsigned int> vectorY, vectorZ;

	// Components of the previously processed triple. Only x and y need to be
	// remembered: z is always emitted. Initialised so they never hold an
	// indeterminate value, although they are only read after the first triple
	// has set them.
	unsigned int lastX = 0, lastY = 0;
	unsigned int x, y, z;

	// First triple: emitted without any leading separator.
	if(it->hasNext()) {
		TripleID *triple = it->next();

		swapComponentOrder(triple, SPO, order);

		lastX = x = triple->getSubject();
		lastY = y = triple->getPredicate();
		z = triple->getObject();

		vectorY.push_back(y);
		vectorZ.push_back(z);

		numTriples++;
	}

	// Rest of the triples
	while(it->hasNext()) {
		TripleID *triple = it->next();

		swapComponentOrder(triple, SPO, order);

		x = triple->getSubject();
		y = triple->getPredicate();
		z = triple->getObject();

		if(x!=lastX) {
			// New x: close both the Y list and the Z list.
			vectorY.push_back(0);
			vectorY.push_back(y);

			vectorZ.push_back(0);
			vectorZ.push_back(z);
		} else if(y!=lastY) {
			// Same x, new y: extend the Y list and close the Z list.
			vectorY.push_back(y);

			vectorZ.push_back(0);
			vectorZ.push_back(z);
		} else {
			// Same (x, y): just one more z.
			vectorZ.push_back(z);
		}

		lastX = x;
		lastY = y;

		NOTIFYCOND(listener, "Converting to CompactTriples.", numTriples, triples.getNumberOfElements());
		numTriples++;
	}

	delete it;

	VectorUIntIterator itY(vectorY);
	VectorUIntIterator itZ(vectorZ);

	streamY->add(itY);
	streamZ->add(itZ);

#if 0
	// Debug Adjacency Lists
	cout << "Y" << vectorY.size() << "): ";
	for(unsigned int i=0;i<arrayY->getNumberOfElements();i++){
		cout << arrayY->get(i) << " ";
	}
	cout << endl;

	cout << "Z" << vectorZ.size() << "): ";
	for(unsigned int i=0;i<arrayZ->getNumberOfElements();i++){
		cout << arrayZ->get(i) << " ";
	}
	cout << endl;
#endif
}
/**
 * Build a PFC-encoded string dictionary from a stream of strings.
 *
 * Strings are grouped in blocks of `blocksize` strings and appended to the
 * byte buffer `text`:
 *   - the first string of each block is copied verbatim, and its starting
 *     byte offset is recorded in `blocks`;
 *   - every other string is stored as the VByte-encoded length of the prefix
 *     it shares with the previous string, followed by the remaining suffix;
 *   - every stored string is followed by a '\0' byte.
 * After the last string the final byte offset is also pushed to `blocks`,
 * the buffer is shrunk to the bytes actually used and `blocks` is compacted
 * with reduceBits().
 * NOTE(review): the prefix sharing is only effective if the iterator yields
 * the strings in sorted order - presumably guaranteed by the caller; confirm.
 *
 * @param it        Source of NUL-terminated strings. Each string is released
 *                  with it->freeStr() once it has been encoded.
 * @param blocksize Number of strings per block. Must be non-zero: it is used
 *                  as the divisor of a modulo for every string.
 * @param listener  Progress sink handed to NOTIFYCOND after every string.
 * @throws const char* if the encoded buffer cannot be allocated or grown.
 *                  It is thrown as a string literal so that no additional
 *                  header is required.
 */
CSD_PFC::CSD_PFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::ProgressListener *listener) :
		isMapped(false) {
	this->type = PFC;
	this->numstrings = 0;
	this->bytes = 0;
	this->blocksize = blocksize;
	this->nblocks = 0;

	// Pointers to the first string of each block.
	// Allocated before the raw buffer so that nothing leaks if `new` throws.
	blocks = new hdt::LogSequence2(sizeof(size_t)==8 ? 34 : 32);

	uint64_t reservedSize = 1024;
	text = (unsigned char*)malloc(reservedSize*sizeof(unsigned char));
	if (text == NULL) {
		delete blocks;
		blocks = NULL;
		throw "CSD_PFC: unable to allocate the encoded sequence";
	}

	unsigned char *currentStr = NULL;
	size_t currentLength = 0;
	string previousStr;

	while (it->hasNext()) {
		currentStr = it->next();
		currentLength = strlen( (char*) currentStr);

		// Grow the buffer if necessary.
		// +1 for the string terminator, +10 for the VByte encoding (worst case).
		if ((bytes+currentLength+11) > reservedSize) {
			reservedSize = (bytes+currentLength+10)*2;

			// Keep the old pointer until the reallocation is known to have
			// succeeded: realloc() leaves the original block untouched on failure.
			unsigned char *grown = (unsigned char*)realloc(text, reservedSize*sizeof(unsigned char));
			if (grown == NULL) {
				free(text);
				text = NULL;
				delete blocks;
				blocks = NULL;
				it->freeStr(currentStr);
				throw "CSD_PFC: unable to grow the encoded sequence";
			}
			text = grown;
		}

		if ((numstrings % blocksize) == 0) {
			// First string in the current block!
			blocks->push_back(bytes);
			nblocks++;

			// The string is explicitly copied to the encoded sequence.
			// The terminator is written separately below, so this is a plain
			// fixed-length byte copy.
			memcpy(text+bytes, currentStr, currentLength);
			bytes+=currentLength;
		} else {
			// Regular string

			// Calculate the length of the common prefix
			unsigned int delta = longest_common_prefix((unsigned char *)previousStr.c_str(), currentStr, previousStr.length(), currentLength);

			// The prefix is differentially encoded
			bytes += VByte::encode(text+bytes, delta);

			// The suffix is copied to the sequence
			memcpy(text+bytes, currentStr+delta, currentLength-delta);
			bytes+=currentLength-delta;
		}

		text[bytes] = '\0';
		bytes++;

		// New string processed
		numstrings++;

		// Save previous (the length is already known, no need to rescan)
		previousStr.assign((char*)currentStr, currentLength);

		NOTIFYCOND(listener, "Converting dictionary to PFC", numstrings, it->getNumberOfElements());

		it->freeStr(currentStr);
	}

	// Storing the final byte position in the vector of positions
	blocks->push_back(bytes);

	// Trunc encoded sequence to save unused memory.
	// Skipped when nothing was encoded: realloc() with a size of zero is not
	// portable. If shrinking fails the original, larger buffer is still valid
	// and is simply kept.
	if (bytes > 0) {
		unsigned char *shrunk = (unsigned char *) realloc(text, bytes*sizeof(unsigned char));
		if (shrunk != NULL) {
			text = shrunk;
		}
	}

	blocks->reduceBits();
}