-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cpp
179 lines (159 loc) · 5.67 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#include <algorithm>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <boost/algorithm/string/join.hpp>
#include <boost/crc.hpp>
#include <boost/filesystem.hpp>
#include <boost/integer.hpp>
#include <boost/iostreams/device/file.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
namespace io = boost::iostreams;
namespace fs = boost::filesystem;
///////////////////////////////////////////////////////////////////////////
// DATA DEF
///////////////////////////////////////////////////////////////////////////
// Upper bound on the number of bytes read from the start of a file when
// computing its CRC; anything beyond this prefix is not hashed.
constexpr std::streamsize buffer_size = 2048;
// Selects which criteria decide whether two files count as duplicates.
struct file_cmp_options_t {
    // Compare the file names (last path component). Enabled by default.
    bool file_name = true;
    // Compare the file contents. Disabled by default.
    bool file_data = false;
};
// CRC32 checksum -> files with that checksum that matched no earlier file.
using crc32_to_path_t = std::multimap<boost::int32_t, fs::path>;
// First file seen of a duplicate group -> every member of the group
// (the first file itself included).
using equals_t = std::map<fs::path, std::vector<fs::path>>;
// Newline-joined list of directories -> set of tab-prefixed lines, each
// listing the " == "-joined file names of one duplicate group.
using reduce_t = std::map<std::string, std::set<std::string>>;
// Shared state for one run: the comparison options plus every table that is
// filled in while scanning and later turned into the printed report.
struct work_package_t {
// Criteria used both for hashing and for the pairwise file comparison.
file_cmp_options_t file_cmp_options;
// Checksum -> files seen so far that had no equal among earlier files.
crc32_to_path_t crc32_to_path;
// Duplicate groups, keyed by the first file of each group.
equals_t equals;
// Report data built by reduce_result and printed by print_reduced_result.
reduce_t reduce;
};
// Exception raised when the checksum of a file cannot be computed
// (file_crc32 throws it when the file cannot be opened).
struct crc_failed_t : public boost::exception, public std::exception {
    const char* what() const noexcept override {
        return "CRC computation failed";
    }
};
///////////////////////////////////////////////////////////////////////////
// CODE
///////////////////////////////////////////////////////////////////////////
// Computes the CRC32 used to bucket a file.
//
// When only name comparison is enabled (file_name set, file_data clear) the
// checksum is taken over the file name (last path component). In every other
// case it is taken over the file contents, limited to the first buffer_size
// bytes.
//
// Throws crc_failed_t when the file has to be read but cannot be opened.
boost::int32_t file_crc32(const file_cmp_options_t& options, const fs::path& file_path) {
    boost::crc_32_type crc;

    const bool hash_name_only = options.file_name && !options.file_data;
    if (hash_name_only) {
        const std::string base_name = file_path.filename().string();
        crc.process_bytes(base_name.c_str(), base_name.size());
        return crc.checksum();
    }

    std::ifstream stream(file_path.c_str(), std::ios::binary);
    if (!stream) {
        throw crc_failed_t();
    }

    // Only the leading chunk is hashed; gcount() is the number of bytes
    // actually read, which is smaller than buffer_size for short files.
    char head[ buffer_size ];
    stream.read(head, buffer_size);
    crc.process_bytes(head, stream.gcount());
    return crc.checksum();
}
// Tells whether two files are equal under the given options.
//
// options.file_name: the file names (last path component) must match.
// options.file_data: the raw contents must match byte for byte.
// With neither option set every pair compares equal.
//
// Returns false, rather than throwing, when a file cannot be inspected or
// mapped.
bool file_compare(const file_cmp_options_t& options, const fs::path& left_path, const fs::path& right_path) {
    // A name mismatch settles the answer; no need to touch the file contents.
    if (options.file_name && left_path.filename() != right_path.filename()) {
        return false;
    }
    if (!options.file_data) {
        return true;
    }

    // Compare sizes first: it is cheap, and it lets us handle empty files
    // without mapping them (a zero-length file cannot be memory-mapped, which
    // previously made two empty files compare as different).
    boost::system::error_code ec;
    const auto left_size = fs::file_size(left_path, ec);
    if (ec) {
        return false;
    }
    const auto right_size = fs::file_size(right_path, ec);
    if (ec) {
        return false;
    }
    if (left_size != right_size) {
        return false;
    }
    if (left_size == 0) {
        return true;
    }

    try {
        io::mapped_file_source left(left_path);
        io::mapped_file_source right(right_path);
        // The sizes are re-checked on the mappings themselves so std::equal
        // never reads past the end of the right-hand mapping. (The MSVC
        // "unchecked iterator" warning on std::equal is therefore harmless.)
        return left.size() == right.size()
            && std::equal(left.data(), left.data() + left.size(), right.data());
    } catch (const boost::exception&) {
        return false;
    } catch (const std::exception&) {
        return false;
    }
}
// Classifies one file: either it duplicates a file already recorded under the
// same checksum, in which case it joins that file's group in
// work_package.equals, or it is recorded as a new candidate in
// work_package.crc32_to_path.
//
// Propagates crc_failed_t from file_crc32 when the checksum cannot be computed.
void check_equals(work_package_t &work_package, const fs::path& path) {
    const auto checksum = file_crc32(work_package.file_cmp_options, path);
    const auto candidates = work_package.crc32_to_path.equal_range(checksum);

    bool matched = false;
    for (auto cur = candidates.first; cur != candidates.second; ++cur) {
        const fs::path& known = cur->second;
        if (!file_compare(work_package.file_cmp_options, known, path)) {
            continue;
        }
        matched = true;

        // Create the group on first use; a freshly created group starts with
        // the file it is keyed by.
        const auto slot = work_package.equals.insert(
            equals_t::value_type(known, std::vector<fs::path>()));
        std::vector<fs::path>& group = slot.first->second;
        if (slot.second) {
            group.push_back(known);
        }
        group.push_back(path);
    }

    if (!matched) {
        work_package.crc32_to_path.insert(crc32_to_path_t::value_type(checksum, path));
    }
}
// Feeds every regular file reachable from p into check_equals.
//
// p may be a directory (walked recursively) or a single regular file.
// Nothing is thrown for filesystem problems: unreadable files are skipped,
// and an error while walking the tree ends the walk of this root; both are
// reported on stderr.
void find_duplicate(const fs::path& p, work_package_t& work_package) {
    // Runs the duplicate check on one file; a file whose checksum cannot be
    // computed is reported and skipped.
    const auto process_file = [&work_package](const fs::path& file) {
        try {
            check_equals(work_package, file);
        } catch (const crc_failed_t&) {
            std::cerr << "warning: cannot read '" << file.string() << "', skipping" << '\n';
        }
    };

    boost::system::error_code ec;

    // A directory iterator cannot be opened on a plain file, so handle that
    // root directly. A bare file name is anchored to "." so that its
    // parent_path() is never empty for later processing.
    if (fs::is_regular_file(p, ec)) {
        process_file(p.has_parent_path() ? p : fs::path(".") / p);
        return;
    }

    fs::recursive_directory_iterator it(p, ec);
    if (ec) {
        std::cerr << "warning: cannot scan '" << p.string() << "': " << ec.message() << '\n';
        return;
    }

    const fs::recursive_directory_iterator end;
    while (it != end) {
        const fs::path entry_path = it->path();
        // The error_code overload reports "not a regular file" on failure
        // instead of throwing for entries that cannot be queried.
        if (fs::is_regular_file(entry_path, ec)) {
            process_file(entry_path);
        }

        it.increment(ec);
        if (ec) {
            // The iterator state after a failed increment is not relied upon;
            // stop walking this root.
            std::cerr << "warning: scan of '" << p.string() << "' stopped: " << ec.message() << '\n';
            break;
        }
    }
}
// Condenses the duplicate groups into the report table.
//
// For each group in work_package.equals, the distinct canonical parent
// directories (joined with newlines) become the key in work_package.reduce,
// and the distinct file names (tab-prefixed, joined with " == ") are added to
// the set stored under that key.
void reduce_result(work_package_t& work_package) {
    for (const auto& group : work_package.equals) {
        std::set<std::string> dir_set;
        std::set<std::string> name_set;
        for (const fs::path& member : group.second) {
            dir_set.insert(fs::canonical(member.parent_path()).string());
            name_set.insert(member.filename().string());
        }

        const std::string names_line = "\t" + boost::algorithm::join(name_set, " == ");
        const std::string dirs_key = boost::algorithm::join(dir_set, "\n");
        work_package.reduce[dirs_key].insert(names_line);
    }
}
// Writes the report to stdout: for every entry of work_package.reduce, the
// directory list first, then the file-name lines one per line.
void print_reduced_result(work_package_t& work_package) {
    for (const auto& section : work_package.reduce) {
        const std::string& directories = section.first;
        const std::set<std::string>& name_lines = section.second;

        std::cout << directories << std::endl;
        std::cout << boost::algorithm::join(name_lines, "\n") << std::endl;
    }
}
int main(int argc, char* argv[]) {
if (argc > 1) {
work_package_t work_package;
for (int i = 1; i < argc; ++i) {
if (strcmp("-data", argv[i]) == 0)
work_package.file_cmp_options.file_data = false;
else if (strcmp("-name", argv[i]) == 0)
work_package.file_cmp_options.file_name = false;
else if (strcmp("+data", argv[i]) == 0)
work_package.file_cmp_options.file_data = true;
else if (strcmp("+name", argv[i]) == 0)
work_package.file_cmp_options.file_name = true;
else {
fs::path p(argv[i]);
if (fs::exists(p)) {
find_duplicate(p, work_package);
}
}
}
reduce_result(work_package);
print_reduced_result(work_package);
}
return 0;
}