// Returns the hash of the stemmed form of `wrd`.
// Results are memoised in `stemmed`, keyed by the hash of the unstemmed word,
// so sstem() runs at most once per distinct word hash.
long long stem(const wstring &wrd)
{
    const long long key = phash(wrd);

    // Fast path: this word hash has been stemmed before.
    const auto cached = stemmed.find(key);
    if (cached != stemmed.end()) {
        return cached->second;
    }

    // Slow path: stem the word, hash the stem, and remember the result.
    const auto stemHash = phash(sstem(wrd));
    return stemmed[key] = stemHash;
}
std::ostream& operator<<(std::ostream& os, const CImagePatch& patch) { os << "Patch:\n"; os << "\tFrame:\n\t\t" << patch.GetFrame() << std::endl; os << "\tBlur value:\n\t\t" << patch.GetBlurValue() << std::endl; os << "\tStandart deviation:\n\t\t" << patch.GetStandartDeviation() << std::endl; os << "\tGrey image:\n" << patch.GrayImage() << std::endl; os << "\tBin image:\n" << patch.BinImage() << std::endl; std::bitset<sizeof(uint64) * 8> phash(patch.GetPHash()); os << "\tPHash:\n\t\t" << phash << std::endl; std::bitset<sizeof(uint64) * 8> avgHash(patch.GetAvgHash()); os << "\tAvgHash:\n\t\t" << avgHash << std::endl; return os; }
// Appends one block (c->block_size bytes starting at buf) to the storage file c.
//
// The block is stored in the cheapest form that applies, tried in this order:
//   1. as a reference to a dependency (c->deps[i]) whose block at
//      c->current_block has the same one-byte hash and identical contents;
//   2. as a "zero" entry when every byte of the block is zero;
//   3. as a plain block via storage__append_block_simple().
// The matching c->writestat_* counter is bumped for each outcome.
void storage__append_block(struct storage__file* c, unsigned char* buf) {
    int i;
    unsigned char hash = phash(buf, c->block_size);

    // Scan dependencies from the highest index down to 0.
    for(i=c->depscount-1; i>=0; --i) {
        unsigned char hc = storage__get_block_hash(c->deps[i], c->current_block);
        if(hc==hash) {
            // Hashes match: fetch the candidate block and compare byte by byte.
            int ret = storage__read_block_nonrecursive(c->deps[i], c->outbuf, c->current_block);
            if(ret!=0){
                // The dependency could not supply the block directly; count it and try the next one.
                ++c->writestat_dblrefs;
                continue;
            }
            if(!memcmp(c->outbuf, buf, c->block_size)) {
                // Identical contents: record a reference (dependency number is i+1) instead of the data.
                storage__append_block_dep(c, i+1, hash);
                ++c->writestat_reused;
                return;
            } else {
                ++c->writestat_hashcoll;
            }
        }
    }

    if(hash==0) {
        // maybe the entire block is zero?
        int j;
        // Scan backwards over the valid indices block_size-1 .. 0 only.
        // (Starting at block_size would read one byte past the block.)
        for (j=c->block_size-1; j>=0; --j) {
            if(buf[j])break;
        }
        if(j==-1) {
            // the block is zero
            storage__append_block_dep(c, -0x8000, 0);
            ++c->writestat_zero;
            return;
        }
    }

    storage__append_block_simple(c, buf, hash);
}
// <hash, <start, len> > vector<pair<long long, pair<int, int> > > splitWords(const wstring &s, vector<pair<long long, long long> > &fixedstem, vector<pair<long long, long long> > &replaced, set<long long> &names) { vector<pair<long long, pair<int, int> > > ans; wstring word; int prevKind = 0; // 1 - letter, 2 - digit wstring S = s + L' '; for(int j=0; j<(int)S.size(); j++) { wchar_t i = towupper(S[j]); if(isLetter(i) && prevKind != 2) { word.push_back(i); prevKind = 1; } else if(isDigit(i) && prevKind != 1) { word.push_back('0'); prevKind = 2; } else { if(word.length()) { bool st = 1; long long pw = phash(word); for(auto &t : fixedstem) { if(t.first == pw) { pw = t.second; // wcerr << pw << L" proc\n"; st = 0; break; } } if(names.count(pw)) { ans.push_back({phname, {j-word.length(), word.length()}}); } else { long long std = 0; if(st) { std = stem(word); for(auto &t : replaced) { if(std == t.first) { std = t.second; break; } } } ans.push_back({st ? std : pw, {j-word.length(), word.length()}}); } } word.clear(); prevKind = 0; if(isLetter(i)) { word.push_back(i); prevKind = 1; } else if(isDigit(i)) { word.push_back('0'); prevKind = 2; } } } return ans; }
{
    return i == L'а' || i == L'я' || i == L'о' || i == L'у' || i == L'ю' || i == L'и' || i == L'е';
}

// Polynomial hash of a wide string.
// Folds the characters left to right as ans = ans * 1000000007 + character,
// starting from 0, so the empty string hashes to 0.
// NOTE(review): the arithmetic is done in signed long long and wraps for all
// but very short strings; signed overflow is undefined behaviour in C++.
// Consider accumulating in unsigned long long and converting at the end.
long long phash(const wstring &s)
{
    long long ans = 0;
    for(auto i: s) {
        ans *= 1000000007LL;
        ans += i;
    }
    return ans;
}

// Hash of the literal L"firstname", computed once during static initialisation.
long long phname = phash(L"firstname");

// <hash, <start, len> >
vector<pair<long long, pair<int, int> > > splitWords(const wstring &s, vector<pair<long long, long long> > &fixedstem, vector<pair<long long, long long> > &replaced, set<long long> &names)
{
    vector<pair<long long, pair<int, int> > > ans;
    wstring word;
    int prevKind = 0; // 1 - letter, 2 - digit
    wstring S = s + L' ';
    for(int j=0; j<(int)S.size(); j++) {
        wchar_t i = towupper(S[j]);
        if(isLetter(i) && prevKind != 2) {
            word.push_back(i);
            prevKind = 1;
int main(int argc, char* argv[]) { if(argc<2 || !strcmp(argv[1], "--help")) { fprintf(stderr, "Usage:\n" " fsfs-debug print-index block_size blockgroup_size file.idx [bgstart [bgcount]]\n" " fsfs-debug comp-stats blockgroup_size file.idx\n" " fsfs-debug decompress-block file.dat offset compressed_size > output\n" " fsfs-debug decompress-block2 < input > output\n" " fsfs-debug compress-block < input > output\n" " fsfs-debug compress-block2 < input > output\n" " fsfs-debug calculate-hash < input\n" " fsfs-debug get-length dir name\n" " fsfs-debug read-one-block dir name blocknum\n" ); return 1; } if(!strcmp(argv[1], "print-index")) { int bgsize=1020; int block_size=4096; long long int bgstart = 0; long long int bgcount = -1; assert(argc>=5 && argc<=7); sscanf(argv[2], "%d", &block_size); sscanf(argv[3], "%d", &bgsize); const char* idxfile = argv[4]; if(argc>=6) sscanf(argv[5], "%lld", &bgstart); if(argc>=7) sscanf(argv[6], "%lld", &bgcount); int bglen = bgsize * 2 + 8; FILE* idx = stdin; if(strcmp(argv[2], "-")) idx = fopen(idxfile, "rb"); assert(idx!=NULL); signed short int q; long long int baseoffset; if(bgstart!=0) { int ret = fseek(idx, bgstart*bglen, SEEK_SET); if(ret) { perror("fseek"); return 2;} } while(bgcount!=0) { int ret = fread(&baseoffset, 1, 8, idx); if(ret==0)break; if(ret!=8) { printf("Trimmed index file\n"); return 2; } baseoffset = be64toh(baseoffset); printf("Block group %lld, base offset: 0x%016llX\n", bgstart, baseoffset); int i; int accum = 0; for(i=0; i<bgsize; ++i) { int ret = fread(&q, 1, 2, idx); if(ret!=2) { printf("Trimmed index file\n"); return 2; } q = be16toh(q); printf("block %lld: ", i + bgstart*bgsize); if(q==-0x8000) { printf("zero\n"); } else if(q==-0x7FFF) { printf("uncompressed (%d bytes) at 0x%016llX\n", block_size, baseoffset+accum); accum+=block_size; } else if(q==0) { printf("unallocated\n"); } else if(q>0 && q<0x4444) { printf("compressed (%d bytes) at 0x%016llX\n", q, baseoffset+accum); accum+=q; } else if (q<0 && q >= 
-64) { printf("reference to %d's dependency\n", (-q)-1); } else { printf("probably invalid (%04X)\n", q); } } fflush(stdout); --bgcount; ++bgstart; } return 0; } else if(!strcmp(argv[1], "comp-stats")) { int bgsize=1020; assert(argc==4); sscanf(argv[2], "%d", &bgsize); const char* idxfile = argv[3]; FILE* idx = stdin; if(strcmp(argv[2], "-")) idx = fopen(idxfile, "rb"); assert(idx!=NULL); long long int baseoffset; signed short int q; unsigned long long int *stats = (unsigned long long int*) malloc(8*32768); unsigned long long int zeroes = 0; unsigned long long int invals = 0; unsigned long long int refs[64]; unsigned long long int total = 0; unsigned long long int uncompressibles = 0; memset(&refs, 0, sizeof(refs)); memset(stats, 0, 8*32768); int i; int trailing_zero_counter=0; for(;;) { int ret = fread(&baseoffset, 1, 8, idx); if(ret!=8)break; baseoffset = be64toh(baseoffset); for(i=0; i<bgsize; ++i) { int ret = fread(&q, 1, 2, idx); if(ret!=2) return 2; q = be16toh(q); if(q>0) { ++stats[q]; }else if(q==-0x7FFF) { ++uncompressibles; }else if(q==-0x8000) { ++zeroes; }else if(q==0) { ++trailing_zero_counter; }else if(q<0 && q>=-64) { ++refs[(-q)-1]; }else{ ++invals; } ++total; } } total-=trailing_zero_counter; long long int running = 0; printf("total: %lld (100%%) ; 0%% \n", total); running+=zeroes; if(zeroes>0)printf("zero: %lld (%g%%) ; %g%%\n", zeroes, 100.0*zeroes/total, 100.0*running/total); for(i=0; i<64; ++i) { running+=refs[i]; if(refs[i]>0)printf("refs[%d]: %lld (%g%%) ; %g%%\n", i, refs[i], 100.0*refs[i]/total, 100.0*running/total); } for(i=0; i<32768; ++i) { running+=stats[i]; if(stats[i]>0)printf("compressed[%d]: %lld (%g%%) ; %g%%\n", i, stats[i], 100.0*stats[i]/total, 100.0*running/total); } running+=uncompressibles; if(uncompressibles>0)printf("uncompressible: %lld (%g%%) ; %g%%\n", uncompressibles, 100.0*uncompressibles/total, 100.0*running/total); running+=invals; if(invals>0)printf("invalid: %lld (%g%%) ; %g%%\n", invals, 100.0*invals/total, 
100.0*running/total); free(stats); return 0; } else if(!strcmp(argv[1], "decompress-block")) { assert(argc==5); const char* datfile = argv[2]; long long int offset = 0; int size; sscanf(argv[3], "%lld", &offset); sscanf(argv[4], "%d", &size); FILE* dat = fopen(datfile, "rb"); int ret = fseek(dat, offset, SEEK_SET); assert(ret==0); assert(size<65536); unsigned char chunk[65536]; unsigned char chunk2[65536+2048]; ret = fread(&chunk, 1, size, dat); assert(ret==size); fclose(dat); lzo_uint len = 65536+2048; lzo1x_decompress_safe(chunk, ret, chunk2, &len, NULL); fwrite(chunk2, 1, len, stdout); return 0; } else if(!strcmp(argv[1], "decompress-block2")) { unsigned char chunk[65536]; unsigned char chunk2[65536+2048]; int ret = fread(&chunk, 1, 65536, stdin); lzo_uint len = 65536+2048; lzo1x_decompress_safe(chunk, ret, chunk2, &len, NULL); fwrite(chunk2, 1, len, stdout); return 0; } else if(!strcmp(argv[1], "compress-block")) { unsigned char chunk[65536]; unsigned char chunk2[65536+2048]; int ret = fread(&chunk, 1, 65536, stdin); char tmp[LZO1X_1_MEM_COMPRESS]; lzo_uint len = 65536+2048; lzo1x_1_compress(chunk, ret, chunk2, &len, &tmp); fwrite(chunk2, 1, len, stdout); return 0; } else if(!strcmp(argv[1], "compress-block2")) { unsigned char chunk[65536]; unsigned char chunk2[65536+2048]; int ret = fread(&chunk, 1, 65536, stdin); char tmp[LZO1X_999_MEM_COMPRESS]; lzo_uint len = 65536+2048; lzo1x_999_compress(chunk, ret, chunk2, &len, &tmp); fwrite(chunk2, 1, len, stdout); return 0; } else if(!strcmp(argv[1], "calculate-hash")) { unsigned char chunk[65536]; int ret = fread(&chunk, 1, 65536, stdin); unsigned char c = phash(chunk, ret); printf("%02x\n", c); return 0; } else if(!strcmp(argv[1], "get-length")) { assert(argc==4); const char* dirname = argv[2]; const char* basename = argv[3]; int block_len = storage__get_block_size2(dirname, basename); long long int number_of_blocks = storage__get_number_of_blocks2(dirname, basename); printf("%lld\n", number_of_blocks*block_len); 
return 0; } else if(!strcmp(argv[1], "read-one-block")) { assert(argc==5); const char* dirname = argv[2]; const char* basename = argv[3]; long long int blocknum; sscanf(argv[4], "%lld", &blocknum); struct storage__file* f = storage__open(dirname, basename); int block_len = storage__get_block_size(f); unsigned char buf[65536]; storage__read_block(f, buf, blocknum); fwrite(buf, 1, block_len, stdout); return 0; } else { fprintf(stderr, "Unknown command %s\n", argv[1]); return 1; } }