void coin::WordCounter::write_frequencies(const string& file, int minimum) const{
    ofstream out(file, ios::out|ios::binary|ios::trunc);
    // Header: grand total of all counts.
    double total = saving_->total();
    out.write((char *)&total, sizeof(double));
    // Walk buckets backwards from the least bucket; stop once a bucket's count
    // drops below `minimum` or more than max_count() words have been written.
    Bucket<string> *bucket = saving_->least_bucket();
    int64_t counter = 0;
    do{
        bucket = bucket->prev();
        if(bucket->count() < minimum){
            bucket = bucket->next();
            break;
        }
        // Collect this bucket's children, ordered by word.
        Element<string> *child = bucket->child();
        map<string, Element<string>*> children;
        do{
            children[child->item()] = child;
            child = child->next();
            ++counter;
        }while(child != bucket->child());
        // Record layout: [int n][double count][n x (short len, len bytes of word)].
        int childs = children.size();
        out.write((char *)&childs, sizeof(int));
        double count = bucket->count();
        out.write((char *)&count, sizeof(double));
        for(pair<string, Element<string>*> child:children){
            short size = child.first.size();
            out.write((char *)&size, sizeof(short));
            out.write(child.first.c_str(), sizeof(char) * size);
        }
        if(counter > saving_->max_count()){
            break;
        }
    }while(bucket != saving_->least_bucket());
    // Dictionary entries are appended as single-word records with their own counts.
    for(pair<string, double> dict_count:dict_counts_){
        int childs = 1;
        out.write((char *)&childs, sizeof(int));
        out.write((char *)&dict_count.second, sizeof(double));
        short size = dict_count.first.size();
        out.write((char *)&size, sizeof(short));
        out.write(dict_count.first.c_str(), sizeof(char) * size);
    }
    out.close();
    cerr << counter << " words with at least " << bucket->count()
         << " occurrences were written" << endl;
}
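// A minimal sketch of reading back the binary layout that write_frequencies()
// emits above: a leading double with the grand total, followed by records of
// [int n][double count][n x (short len, len bytes of word)] until end-of-file.
// This reader is illustrative only; the function name and return type are
// assumptions and are not part of the WordCounter API.
#include <fstream>
#include <map>
#include <string>

std::map<std::string, double> read_frequencies_sketch(const std::string &file)
{
    std::ifstream in(file, std::ios::in | std::ios::binary);
    std::map<std::string, double> counts;
    double total = 0;
    in.read((char *)&total, sizeof(double));
    int n = 0;
    while (in.read((char *)&n, sizeof(int))) {
        // One record: a shared count followed by n length-prefixed words.
        double count = 0;
        in.read((char *)&count, sizeof(double));
        for (int i = 0; i < n; ++i) {
            short size = 0;
            in.read((char *)&size, sizeof(short));
            std::string word(size, '\0');
            in.read(&word[0], size);
            counts[word] = count;
        }
    }
    return counts;
}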
void func() {
    // test when file does not exist
    {
        bool ok = false;
        try {
            StatFile sf("doesnotexist.sf", Stats(), mm);
        }
        catch (std::runtime_error const &re) {
            ok = true;
        }
        assert_true(ok);
    }
    setUp();
    // test mapTimeToBucketIndex() and mapBucketIndexToFileIndex()
    {
        StatFile sf("created.sf", Stats(), mm);
        assert_equal(2, sf.lastBucket());
        assert_equal(0, sf.mapTimeToBucketIndex(1000000000));
        assert_equal(0, sf.mapTimeToBucketIndex(1000000001));
        assert_equal(0, sf.mapBucketIndexToFileIndex(0));
        assert_equal(1, sf.mapTimeToBucketIndex(1000000010));
        assert_equal(1, sf.mapBucketIndexToFileIndex(1));
        assert_equal(2, sf.mapTimeToBucketIndex(1000000020));
        assert_equal(2, sf.mapBucketIndexToFileIndex(2));
        int64_t calculated_first_bucket_time = 1000000020 - (10240-1)*10;
        assert_equal(-10237, sf.mapTimeToBucketIndex(calculated_first_bucket_time));
        assert_equal(3, sf.mapBucketIndexToFileIndex(-10237));
        // go backwards across the wrap boundary
        assert_equal(-1, sf.mapTimeToBucketIndex(1000000000-10));
        assert_equal(10239, sf.mapBucketIndexToFileIndex(-1));
        // go forwards once across the wrap boundary
        assert_equal(10241, sf.mapTimeToBucketIndex(1000000020+(10239*10)));
        assert_equal(-1, sf.mapBucketIndexToFileIndex(10241));
        // go forwards twice across the wrap boundary
        assert_equal(20481, sf.mapTimeToBucketIndex(1000000020+(10240*10)+(10239*10)));
        assert_equal(-1, sf.mapBucketIndexToFileIndex(20481));
        // test round_up
        assert_equal(1, sf.mapTimeToBucketIndex(1000000001, true));
        assert_equal(1, sf.mapBucketIndexToFileIndex(1));
    }
    mm->dispose();
    mm = NewMmap();
    // test numBucketsBetween()
    {
        StatFile sf("created.sf", Stats(), mm);
        assert_equal(2, sf.lastBucket());
        assert_equal(1, sf.numBucketsBetween(1000000000, 1000000000));
        assert_equal(1, sf.numBucketsBetween(1000000000, 1000000001));
        assert_equal(2, sf.numBucketsBetween(1000000001, 1000000011, true));
        assert_equal(1, sf.numBucketsBetween(1000000000, 1000000010));
    }
    mm->dispose();
    mm = NewMmap();
    // test reading bucket contents and settings back from the file
    {
        StatFile sf("created.sf", Stats(), mm);
        assert_equal(2, sf.lastBucket());
        Bucket const &b = sf.bucket(0);
        assert_equal(b.count(), 3);
        assert_equal(b.min(), 4.0);
        assert_equal(b.max(), 5.0);
        assert_equal(b.sum(), 14.0);
        assert_equal(b.sumSq(), 66);
        assert_equal(b.time(), 1000000000);
        Bucket const &c = sf.bucket(1);
        assert_equal(c.count(), 2);
        assert_equal(c.sumSq(), 16+25);
        assert_equal(c.time(), 1000000010);
        assert_equal(sf.settings().intervalTime, 10);
        // numSamples is rounded up to a whole number of buckets per 8 KB page
        ssize_t bpp = 8192 / sizeof(Bucket);
        ssize_t n = 10000;
        int ns = 0;
        while (n > 0) {
            ns = ns + bpp;
            n = n - bpp;
        }
        assert_equal(sf.settings().numSamples, ns);
        assert_equal(sf.settings().unit[0], 0);
    }
    mm->dispose();
    mm = NewMmap();
    // test that pre-fetching is turned off
    {
        WrapMmap wm(mm);
        StatFile sf("created.sf", Stats(), &wm);
        assert_equal(2, sf.lastBucket());
        Bucket const &b = sf.bucket(2);
        assert_equal(b.count(), 3);
        assert_equal(wm.open_, 1);
        assert_equal(wm.close_, 0);
        assert_equal(wm.map_, 2);    // file header, first page
        assert_equal(wm.unmap_, 0);
        assert_equal(wm.lastOffsetMapped_, 8192);    // page size
        size_t n = 8192 / sizeof(Bucket);
        for (size_t i = 3; i < n-1; ++i) {
            Bucket c(10, 50, 5, 5, 2, 1000000000 + 10 * i);
            sf.updateBucket(c);
        }
        assert_equal(wm.map_, 2);
        assert_equal(wm.lastOffsetMapped_, 8192);
        {
            Bucket c(10, 50, 5, 5, 2, 1000000000 + 10 * n);
            sf.updateBucket(c);
        }
        // fetched a new page now that we hit a bucket on the next page
        assert_equal(wm.map_, 3);
        assert_equal(wm.lastOffsetMapped_, 8192 * 2);
        int64_t no, nc, nm, nu;
        mm->counters(&nm, &nu, &no, &nc);
        assert_equal(nm, wm.map_);
        assert_equal(nu, wm.unmap_);
        assert_equal(no, wm.open_);
        assert_equal(nc, wm.close_);
    }
    // test that we deal with intermediate empty buckets
    {
        setUp();
        StatFile sf("created.sf", Stats(), mm);
        Bucket b(123, 456, 3323, 4, 3, 1000000000 + 100);
        sf.updateBucket(b);
        for (int i = 0; i < 9; ++i) {
            Bucket const &c = sf.bucket(i);
            if (c.time() != 0) {
                assert_equal(c.time(), 1000000000 + i * 10);
            }
        }
        Bucket const &c = sf.bucket(6);
        assert_equal(c.min(), 0);
        Bucket const &d = sf.bucket(10);
        assert_equal(d.min(), 3323);
    }
    // test seasonal buckets
    {
        unlink("season.sf");
        StatFile::Settings settings;
        settings.zeroTime = 1000000000;
        settings.intervalTime = 10;
        settings.numSamples = 8640;
        settings.flags = FILE_FLAG_IS_TRAILING;
        settings.season = 86400;
        settings.lambda = 0.5;
        StatFile sf("season.sf", Stats(), settings, mm, true);
        {
            Bucket b1(2, 4, 2, 2, 1, 1000100000);
            sf.updateBucket(b1);
            sf.updateBucket(Bucket(b1, 1000100100));
            sf.updateBucket(Bucket(b1, 1000100110));
            sf.updateBucket(Bucket(b1, 1000100120));
            sf.updateBucket(Bucket(b1, 1000100130));
            sf.updateBucket(Bucket(b1, 1000100140));
            sf.updateBucket(Bucket(b1, 1000110100));
            sf.updateBucket(Bucket(b1, 1000110110));
            sf.updateBucket(Bucket(b1, 1000110120));
            sf.updateBucket(Bucket(b1, 1000110130));
            sf.updateBucket(Bucket(b1, 1000110140));
        }
        {
            // 86400 after the previous write -- one season later
            Bucket b2(0, 0, 0, 0, 1, 1000196500);
            sf.updateBucket(b2);
            sf.updateBucket(Bucket(b2, 1000196530));
            sf.updateBucket(Bucket(b2, 1000196540));
            sf.updateBucket(Bucket(b2, 1000196550));
        }
        int64_t ix = sf.mapTimeToBucketIndex(1000196500);
        Bucket bOut;
        int64_t r = sf.readBuckets(&bOut, 1, ix);
        assert_equal(r, 1);
        assert_equal(bOut.count(), 1);
        // with lambda 0.5, half of the previous season's value carries into the new bucket
        assert_equal(bOut.min(), 0.5 * 2);
        assert_equal(bOut.max(), 0.5 * 2);
        assert_equal(bOut.sum(), 0.5 * 2);
        assert_equal(bOut.sumSq(), 0.5 * 4);
        sf.flush();
    }
    // test that a time older than the bucket range maps to a bucket index
    // outside the file (file index -1)
    {
        setUp();
        StatFile sf("created.sf", Stats(), mm);
        StatFile::Settings const &settings = sf.settings();
        Bucket const &last_bucket = sf.bucket(sf.lastBucket());
        time_t last_time = last_bucket.time();
        time_t old_time = last_time - (settings.numSamples+1) * settings.intervalTime;
        int64_t old_index = sf.mapTimeToBucketIndex(old_time, true);
        assert_equal(-10239, old_index);
        assert_equal(-1, sf.mapBucketIndexToFileIndex(old_index));
    }
    // test firstBucketTime() and lastBucketTime()
    {
        setUp();
        StatFile sf("created.sf", Stats(), mm);
        assert_equal(1000000020, sf.lastBucketTime());
        assert_equal(1000000020 - (10240-1)*10, sf.firstBucketTime());
        assert_equal(1000000000, sf.bucket(sf.firstBucket()).time());
    }
}
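// A hedged sketch of the index-to-slot mapping the tests above exercise: bucket
// indices wrap around a file of numSamples slots, and indices outside the
// currently retained [firstBucket, lastBucket] window map to -1. The helper
// name and parameters are illustrative assumptions, not the StatFile
// implementation of mapBucketIndexToFileIndex().
#include <cstdint>

static int64_t sketch_index_to_file_index(int64_t bucketIndex,
                                           int64_t firstBucket,
                                           int64_t lastBucket,
                                           int64_t numSamples)
{
    if (bucketIndex < firstBucket || bucketIndex > lastBucket) {
        return -1;  // outside the window the file currently holds
    }
    // Wrap into [0, numSamples); C++ % truncates, so fix up negative remainders.
    int64_t slot = bucketIndex % numSamples;
    return slot < 0 ? slot + numSamples : slot;
}

// With lastBucket = 2, firstBucket = 2 - 10240 + 1 and numSamples = 10240, this
// reproduces the values asserted above: 0 -> 0, -1 -> 10239, -10237 -> 3,
// 10241 -> -1, 20481 -> -1.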
bool StatFile::rawUpdateBucket(Bucket const &data, RawUpdateMode mode) {
    if (!fileWritable_) {
        throw std::logic_error(std::string("Attempt to update bucket in read-only StatFile ")
            + fileHeader_->name);
    }
    // Work out which bucket to put the data in
    int64_t targetBucketIndex = mapTimeToBucketIndex(data.time(), false);
    int64_t targetBucketTime = data.time() - data.time() % fileHeader_->cfg_interval;
    int64_t latestBucketIndex = fileHeader_->last_bucket;
    Bucket *bp = 0;
    if (targetBucketIndex > latestBucketIndex) {
        // Note: I don't clear the bucket here. It will contain
        // old data. The user will have to detect this and discard
        // the data instead.
        fileHeader_->last_bucket = targetBucketIndex;
        fileHeader_->last_time = targetBucketTime;
        if (fileHeader_->first_bucket + bucketCount_ <= fileHeader_->last_bucket) {
            fileHeader_->first_bucket = fileHeader_->last_bucket - bucketCount_ + 1;
        }
    }
    if (!StatFile::isBucketIndexInFile(targetBucketIndex)) {
        Log(LL_Warning, "libistat") << "Cannot go back to time" << data.time()
            << "when at" << fileHeader_->last_time << ":" << fileHeader_->name;
        return false;
    }
    if (targetBucketIndex < fileHeader_->first_bucket) {
        fileHeader_->first_bucket = targetBucketIndex;
    }
    if (fileHeader_->flags & FILE_FLAG_IS_TRAILING) {
        bp = getTrailingBucket(targetBucketTime);
    }
    else {
        bp = writableBucket(targetBucketIndex);
        if (bp->time() != targetBucketTime) {
            // The slot still holds data from an older wrap; start the bucket fresh.
            memset(bp, 0, sizeof(*bp));
        }
    }
    if ((fileHeader_->flags & FILE_FLAG_IS_COLLATED)
        || mode == RAWUP_OVERWRITE
        || (mode == RAWUP_FILL_EMPTY && bp->count() == 0)) {
        // Replace the bucket contents wholesale.
        memcpy(bp, &data, sizeof(*bp));
    }
    else {
        // Accumulate into the existing bucket, re-stamped with the bucket's start time.
        Bucket timeData(data, targetBucketTime);
        bp->update(timeData);
        if (fileHeader_->flags & FILE_FLAG_IS_COUNTER_AGGREGATE) {
            bp->setCount(fileHeader_->fixed_count);
        }
    }
    fileHeader_->cumulative_sum += data.sum();
    fileHeader_->cumulative_sum_sq += data.sumSq();
    fileHeader_->cumulative_count += data.count();
    return true;
}
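// A standalone sketch of the bucket arithmetic rawUpdateBucket() relies on: the
// bucket index counts whole intervals since the file's zero time, and the
// bucket's start time is the sample time truncated to the interval. The helper
// names are hypothetical; the real mapTimeToBucketIndex() also handles the
// round-up flag and out-of-range times, which this sketch does not.
#include <cstdint>

static int64_t sketch_bucket_index(int64_t t, int64_t zeroTime, int64_t interval)
{
    // e.g. with zeroTime 1000000000 and interval 10, t = 1000000020 -> index 2,
    // matching the assertions in the tests above.
    return (t - zeroTime) / interval;
}

static int64_t sketch_bucket_start_time(int64_t t, int64_t interval)
{
    // Same truncation rawUpdateBucket() uses for targetBucketTime.
    return t - t % interval;
}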