// Recursively collects the files found under directory `p`.
//
// Each returned key is the file name prefixed with the path it was reached
// through (no prefix when `p` is "."); the mapped value is whatever
// os::FileSys::readDirEx reports for that file in `d.files`
// (presumably the file size -- confirm against os::Dir).
// Files and directories whose name starts with '.' are skipped.
//
// A function-local static counter accumulates the number of files seen over
// all calls (it is never reset) and is echoed to stdout followed by '\r'
// as a progress indicator.
//
// An empty path triggers never(1).
msu getListOfFiles(os::Path p)
{
    static ull seenTotal = 0;

    const string dirName = p.str();
    if ( dirName.empty() ) never(1);

    // Entries directly under "." are reported without a leading "./".
    const string prefix = ( dirName == "." ) ? string("") : dirName + '/';

    const os::Dir listing = os::FileSys::readDirEx(p, true, true);

    seenTotal += listing.files.size();
    cout << seenTotal << '\r' << std::flush;

    msu result;

    for ( const auto & file : listing.files )
    {
        if ( file.first[0] != '.' )
            result[prefix + file.first] = file.second;
    }

    for ( const auto & sub : listing.dirs )
    {
        if ( sub[0] == '.' ) continue;

        const auto nested = getListOfFiles(prefix + sub);
        result.insert(nested.begin(), nested.end());
    }

    return result;
}
// Recursively collects the files found under directory `p`, keyed by File.
//
// For every file a File record is built from the directory prefix, the file
// name, os::FileSys::mtime() of the prefixed name and the value reported by
// os::FileSys::readDirEx for that file; the same reported value is stored as
// the mapped value.
//
// When `dot` is false, files and directories whose name starts with '.' are
// skipped; when it is true they are included.
//
// A function-local static counter accumulates the number of files seen over
// all calls (it is never reset) and is echoed to stdout followed by '\r'
// as a progress indicator.
//
// An empty path triggers never(1).
sam::mfu sam::getListOfFiles(os::Path p, bool dot)
{
    static ull seenTotal = 0;

    const string dirName = p.str();
    if ( dirName.empty() ) never(1);

    // Entries directly under "." are reported without a leading "./".
    const string prefix = ( dirName == "." ) ? string("") : dirName + '/';

    const os::Dir listing = os::FileSys::readDirEx(p, true, true);

    seenTotal += listing.files.size();
    cout << seenTotal << '\r' << std::flush;

    mfu result;

    for ( const auto & file : listing.files )
    {
        const bool hidden = ( file.first[0] == '.' );
        if ( hidden && !dot ) continue;

        string fullName = prefix + file.first;
        result[File {prefix, file.first, os::FileSys::mtime(fullName), ull(file.second)}] = file.second;
    }

    for ( const auto & sub : listing.dirs )
    {
        const bool hidden = ( sub[0] == '.' );
        if ( hidden && !dot ) continue;

        const auto nested = getListOfFiles(prefix + sub, dot);
        result.insert(nested.begin(), nested.end());
    }

    return result;
}
// Returns the concatenated results of getListOfFiles() for every directory
// reported by getListOfDirs(directory), followed by those of `directory`
// itself.
// NOTE(review): whether sub-sub-directories are covered depends on
// getListOfDirs(), which is not visible here.
QStringList Helpers::getListOfFilesRecursively(const QString directory) const
{
    // Directories to scan: everything getListOfDirs() reports, then the root last.
    QStringList searchDirs = getListOfDirs(directory);
    searchDirs << directory;

    QStringList collected;
    for (QStringList::const_iterator it = searchDirs.constBegin();
         it != searchDirs.constEnd(); ++it)
    {
        collected += getListOfFiles(*it);
    }
    return collected;
}
// Searches the current directory tree for groups of files with equal content
// and prints the result to stdout.
//
// Steps, as implemented below:
//   1. Collect all files with getListOfFiles(".").
//   2. Drop zero-size files and files whose size occurs only once.
//   3. Group the remaining files into buckets keyed by SH{size, ""}.
//   4. For SZ = 1, 10, 100, ... while SZ <= sz and buckets remain:
//        - re-bucket every file by SH{size, gethash(file, SZ)};
//        - discard groups with a single member;
//        - groups whose size is greater than SZ go to the next round,
//          the others are recorded as "same".
//   5. Print the "same" groups, then any groups still undecided once the
//      loop stopped (reported as exceeding kb(sz)).
//
// sz - upper bound for SZ, the second argument handed to gethash().
void mymain(ull sz)
{
    msu candidates = getListOfFiles(".");
    cout << "Read: " << candidates.size() << " files\n";

    cout << "Removing unique size and 0-size files\n";
    {
        // How many files share each size.
        std::map<ull, ull> sizeCount;
        for ( const auto & entry : candidates ) sizeCount[entry.second] += 1;

        msu kept;
        for ( const auto & entry : candidates )
        {
            const ull bytes = entry.second;
            if ( bytes != 0 && sizeCount[bytes] != 1 ) kept.insert(entry);
        }

        kept.swap(candidates);
    }
    cout << "Remained: " << candidates.size() << " files\n";

    // Initial buckets: one per file size, hash part left empty.
    std::map<SH, vstr> buckets;
    for ( const auto & entry : candidates )
    {
        SH key {entry.second, ""};
        buckets[key].push_back(entry.first);
    }
    cout << "Made " << buckets.size() << " buckets\n";

    // The flat list is no longer needed; release it.
    candidates.clear();

    std::map<SH, vstr> same;

    for ( ull SZ = 1; SZ <= sz && !buckets.empty(); SZ *= 10 )
    {
        // Number of files still under consideration in this round.
        ull total = 0;
        for ( const auto & buck : buckets ) total += buck.second.size();

        cout << "Checking " << total << " files for size " << kb(SZ) << std::flush;

        std::map<SH, vstr> rehashed;
        ull done = 0;
        Timer timer;

        for ( auto & buck : buckets )
        {
            // Iterated by value as before: gethash()'s parameter type is not visible here.
            for ( auto file : buck.second )
            {
                SH key { buck.first.size, gethash(file, SZ) };
                rehashed[key].push_back(file);
                ++done;

                // Refresh the progress line only when the timer threshold has passed.
                if ( timer.get() / 1000 > 1 )
                {
                    timer.init();
                    cout << "\rChecking " << done << "/" << total
                         << " files for size " << kb(SZ) << " " << std::flush;
                }
            }
        }

        cout << "\rChecked " << done << "/" << total
             << " files for size " << kb(SZ) << std::flush;
        cout << " " << std::flush;

        // Keep only groups with more than one member; decide which are final.
        buckets.clear();
        for ( const auto & group : rehashed )
        {
            if ( group.second.size() <= 1 ) continue;

            if ( SZ < group.first.size ) buckets.insert(group);
            else same.insert(group);
        }

        cout << "\n";
    } // next SZ

    cout << "\n";

    if ( same.empty() )
    {
        cout << "No same files found";
        if ( !buckets.empty() ) cout << ", BUT";
        cout << "\n";
    }
    else
    {
        cout << "Same files found, ";
        prn_buckets(same);
    }

    if ( !buckets.empty() )
    {
        cout << "Unchecked files exceeded " << kb(sz) << "\n";
        prn_buckets(buckets);
    }
}