/*
 * Interactive input loop: prints the option menu, reads characters from
 * stdin until one is not a newline, and passes that character to
 * run_options(). Repeats until exit_read becomes true or stdin reaches
 * end-of-file.
 *
 * The signature has the shape of a GLib thread function; presumably this
 * runs on its own thread - confirm against the caller.
 *
 * data: unused.
 * Returns NULL in every case.
 */
static gpointer
read_user (gpointer data)
{
  /* int, not char: getchar() reports end-of-file as EOF, which does not
   * survive a conversion to char. */
  int opt;

  (void) data;

  while (!exit_read) {
    print_options ();

    do {
      opt = getchar ();
      if (opt == EOF) {
        /* stdin is closed or failed: there will never be another option,
         * so stop instead of looping forever on the same EOF. */
        return NULL;
      }
      if (exit_read) {
        break;
      }
    } while (opt == '\n');

    /* As before, the character read last is still dispatched even when
     * exit_read was raised while waiting for input. */
    run_options ((char) opt);
  }

  return NULL;
}
int main(int argc, char* argv[]) { std::string server; std::string port; std::string user; std::string folder; std::string password; fs::path from; fs::path to; bool inbox; //[Gmail]/Sent Mail po::options_description general_options("General"); general_options.add_options() ("help", "list options"); po::options_description file_options("Load"); file_options.add_options() ("save-raw", po::value<fs::path>(&to), "path to save the data (after download phase)"); po::options_description source_options("Download"); source_options.add_options() ("load", po::value<fs::path>(&from), "mail folder"); po::options_description run_options("Run"); po::options_description all_options("Email Topology Options"); all_options .add(general_options) .add(file_options) .add(source_options); if(argc < 2) { std::cout << all_options << std::endl; return 1; } po::variables_map vm; try { int options_style = po::command_line_style::default_style; po::store(po::parse_command_line(argc, argv, all_options, options_style), vm); po::notify(vm); } catch(std::exception& e) { std::cout << all_options << std::endl; std::cout << "Command line parsing failed: " << e.what() << std::endl; return 1; } if(vm.count("help")) { std::cout << all_options << std::endl; return 1; } email_id_bimap email_id; connectedness_graph cg; entity_map em; initial_group_partition_map igpm; message_id_set message_id; if(!vm.count("save-raw")) { std::cout << "you must specify --save-raw with a file name" << std::endl; return 1; } if(!vm.count("load") || !fs::exists(from) || !fs::is_directory(from)) { std::cout << "missing source data folder (or not a folder)" << std::endl; return 1; } std::vector<char> buffer(128 * 1024); std::string headers; std::cout << "loading " << from; for(fs::recursive_directory_iterator fe, fi(from); fi != fe; ++fi) { if(fs::is_directory(*fi)) continue; fs::ifstream in(*fi); headers.clear(); while(in.good()) { in.getline(&buffer[0], buffer.size()); std::string line = &buffer[0]; 
boost::algorithm::trim(line); if(line.empty()) break; headers += "\n"; headers += line; } members_t g; for(boost::sregex_iterator i(headers.begin(), headers.end(), re_terms), e; i != e; ++i) { const boost::smatch& what = *i; std::string field = what[1].str(); if(boost::algorithm::iequals(field, "Message-ID")) { std::string id = what[2].str(); boost::algorithm::trim(id); std::pair<message_id_set::iterator, bool> result = message_id.insert(id); //skip this duplicate message if(!result.second) { g.clear(); break; } } else if(boost::algorithm::iequals(field, "From") || boost::algorithm::iequals(field, "To") || boost::algorithm::iequals(field, "Cc")) { std::string data = what[2].str(); boost::replace_all(data, "\n", ""); boost::replace_all(data, "\r", ""); for(boost::sregex_iterator j(data.begin(), data.end(), re_email), e; j != e; ++j) { std::string name = (*j)[1].str(); if(name.empty()) name = (*j)[2].str(); std::string email_address = (*j)[3].str(); boost::algorithm::to_lower(email_address); boost::algorithm::trim(name); std::pair<email_id_bimap::map_by<email>::iterator, bool> result = email_id.by<email>().insert( email_id_bimap::map_by<email>::value_type(email_address, email_id.size())); if(result.second) std::cout << "@" << std::flush; if(!name.empty() && boost::to_lower_copy(name) != email_address) em[email_address].insert(name); g.insert(result.first->second); } } } if(g.empty()) { continue; } initial_group_partition_map::iterator r = igpm.find(g); if(r == igpm.end()) { connectedness_graph::vertex_descriptor node = gr::add_vertex(cg); cg[node].members = g; cg[node].weight = 1; igpm.insert(r, std::make_pair(g, node)); } else { connectedness_graph::vertex_descriptor node = r->second; cg[node].weight++; } std::cout << ". 
" << std::flush; } std::cout << std::endl; if(fs::exists(to)) fs::remove(to); fs::ofstream out(to); std::cout << "saving data to " << to.file_string(); for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) { out << (unsigned long long)cg[*i].weight; for(members_t::iterator j = cg[*i].members.begin(); j != cg[*i].members.end(); ++j) { out << "\t" << email_id.by<bit>().equal_range(*j).first->second; } out << std::endl; } out << "-" << std::endl; for(entity_map::iterator i = em.begin(); i != em.end(); ++i) { out << i->first; for(std::set<std::string>::iterator k = i->second.begin(); k != i->second.end(); ++k) { out << "\t" << *k; } out << std::endl; } return 0; }
int main(int argc, char* argv[]) { std::string server; std::string port; std::string user; std::string folder; std::string password; std::vector<fs::path> from; std::vector<fs::path> entity; fs::path to; //[Gmail]/Sent Mail po::options_description general_options("General"); general_options.add_options() ("help", "list options"); po::options_description file_options("Load"); file_options.add_options() ("save-raw", po::value<fs::path>(&to), "path to save the data (after download phase)"); po::options_description download_options("Download"); download_options.add_options() ("server", po::value<std::string>(&server), "imap server dns/ip") ("port", po::value<std::string>(&port)->default_value("993"), "imap port") ("folder", po::value<std::string>(&folder)->default_value("Sent"), "imap folder") ("user", po::value<std::string>(&user), "imap username") ("password", po::value<std::string>(&password), "imap password (will ask if not specified)"); po::options_description run_options("Run"); po::options_description all_options("Email Topology Options"); all_options .add(general_options) .add(file_options) .add(download_options); if(argc < 2) { std::cout << all_options << std::endl; return 1; } po::variables_map vm; try { int options_style = po::command_line_style::default_style; po::store(po::parse_command_line(argc, argv, all_options, options_style), vm); po::notify(vm); } catch(std::exception& e) { std::cout << all_options << std::endl; std::cout << "Command line parsing failed: " << e.what() << std::endl; return 1; } if(vm.count("help")) { std::cout << all_options << std::endl; return 1; } email_id_bimap email_id; connectedness_graph cg; entity_map em; initial_group_partition_map igpm; if(!vm.count("save-raw")) { std::cout << "you must specify --save-raw with a file name" << std::endl; return 1; } if(!vm.count("password")) { password = getpass("Password: "******"missing server for download" << std::endl; return 1; } if(user.empty()) { std::cout << "missing user for 
download" << std::endl; return 1; } if(password.empty()) { std::cout << "missing user for download" << std::endl; return 1; } //this is our network block, downloads all messages headers try { std::cout << "downloading " << folder << " from " << server << std::endl; //use to dedupe if there are dupes message_id_set message_id; typedef boost::function<void (const std::string&, const std::list<std::string>& args)> untagged_handler; std::string pending_tag = "* "; std::list<std::string> pending_command; pending_command.push_back("WAIT_FOR_ACK"); untagged_handler pending_handler; unsigned int command_id = 0; //The sequence of imap commands we want to run std::list<std::list<std::string> > commands; std::list<untagged_handler> handlers; handlers.push_back(log_handler()); commands.push_back(std::list<std::string>()); std::ostringstream login_os; login_os << "LOGIN \"" << user << "\" {" << password.size() << "}"; commands.back().push_back(login_os.str()); commands.back().push_back(password); handlers.push_back(log_handler()); commands.push_back(std::list<std::string>()); commands.back().push_back("LIST \"\" *"); handlers.push_back(log_handler()); commands.push_back(std::list<std::string>()); commands.back().push_back("SELECT \"" + folder + "\""); handlers.push_back(header_handler(email_id, cg, em, message_id, igpm)); commands.push_back(std::list<std::string>()); commands.back().push_back("FETCH 1:* (BODY.PEEK[HEADER.FIELDS (MESSAGE-ID FROM TO CC)])"); commands.push_back(std::list<std::string>()); handlers.push_back(log_handler()); commands.back().push_back("LOGOUT"); //open ssl connection to the server, no cert checking asio::io_service io_service; asio::ip::tcp::resolver resolver(io_service); asio::ip::tcp::resolver::query query(server, port); asio::ip::tcp::resolver::iterator iterator = resolver.resolve(query); asio::ssl::context context(io_service, asio::ssl::context::sslv23); context.set_verify_mode(asio::ssl::context::verify_none); 
asio::ssl::stream<asio::ip::tcp::socket> socket(io_service, context); socket.lowest_layer().connect(*iterator); socket.handshake(asio::ssl::stream_base::client); asio::streambuf buf; while(true) { //read the next line of data std::size_t line_length = asio::read_until(socket, buf, re_crlf); std::string line( asio::buffers_begin(buf.data()), asio::buffers_begin(buf.data()) + line_length); buf.consume(line_length); boost::match_results<std::string::iterator> what; std::size_t initial = 0; std::list<std::string> args; //the line may be split into segments with chunks of data embedded, this is the case //for bodies or message header blocks that are returned, we only handle this case if it //comes in untagged response (*) not a continuation (+), i think that is normal while(regex_search(line.begin() + initial, line.end(), what, re_byte_buffer, boost::match_default)) { unsigned int bytes = boost::lexical_cast<unsigned int>(what[1].str()); if(buf.size() < bytes) asio::read(socket, buf, asio::transfer_at_least(bytes - buf.size())); args.push_back( std::string( asio::buffers_begin(buf.data()), asio::buffers_begin(buf.data()) + bytes)); buf.consume(bytes); line.resize(what[1].second - line.begin()); initial = line.size(); //read the next line of data line_length = asio::read_until(socket, buf, re_crlf); line += std::string( asio::buffers_begin(buf.data()), asio::buffers_begin(buf.data()) + line_length); buf.consume(line_length); } if(boost::algorithm::starts_with(line, pending_tag)) { //if the command is being completed, then we will go here, bail out if the response wasn't ok if(!boost::algorithm::starts_with(line, pending_tag + "OK")) { std::cout << line; throw std::runtime_error("command failed"); } //pull the next command off the list pending_tag = "A" + boost::lexical_cast<std::string>(command_id++) + " "; if(commands.size() == 0) break; pending_handler = handlers.front(); pending_command = commands.front(); commands.pop_front(); handlers.pop_front(); //send the command 
along with any data arguments std::cout << pending_tag << pending_command.front() << std::endl; asio::write(socket, asio::buffer(pending_tag.data(), pending_tag.size())); for(std::list<std::string>::iterator i = pending_command.begin(); i != pending_command.end(); ++i) { if(i != pending_command.begin()) { //print the continuation response std::size_t line_length = asio::read_until(socket, buf, re_crlf); std::string line( asio::buffers_begin(buf.data()), asio::buffers_begin(buf.data()) + line_length); buf.consume(line_length); std::cout << line << std::flush; if(!boost::algorithm::starts_with(line, "+ ")) { throw std::runtime_error("bad response when writing extra data"); } } else { //print it out as well (but not the args) std::cout << *i << std::endl; } asio::write(socket, asio::buffer(i->data(), i->size())); asio::write(socket, asio::buffer("\r\n", 2)); } } else if(boost::algorithm::starts_with(line, "* ")) { //if there is a registered handler, dispatch to it if(pending_handler) pending_handler(line, args); } else { throw std::runtime_error("unrecognized response"); } } } catch (std::exception& e) { std::cout << "Exception: " << e.what() << std::endl; return 1; } std::cout << std::endl; if(to.empty()) { std::cout << "Missing output file for save" << std::endl; return 1; } if(fs::exists(to)) fs::remove(to); fs::ofstream out(to); std::cout << "saving data to " << to.file_string(); for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) { out << (unsigned long long)cg[*i].weight; for(members_t::iterator j = cg[*i].members.begin(); j != cg[*i].members.end(); ++j) { out << "\t" << email_id.by<bit>().equal_range(*j).first->second; } out << std::endl; } out << "-" << std::endl; for(entity_map::iterator i = em.begin(); i != em.end(); ++i) { out << i->first; for(std::set<std::string>::iterator k = i->second.begin(); k != i->second.end(); ++k) { out << "\t" << *k; } out << std::endl; } return 0; }
int main(int argc, char* argv[]) { std::vector<fs::path> from; std::vector<fs::path> entity; std::string ignore_string, save_base; unsigned int threshold; unsigned int person_threshold; bool no_individuals; bool remove_most_common; std::vector<unsigned int> save_at_v; std::set<unsigned int> save_at; //[Gmail]/Sent Mail po::options_description general_options("General"); general_options.add_options() ("help", "list options"); po::options_description file_options("Load"); file_options.add_options() ("ignore", po::value<std::string>(&ignore_string)->default_value("@lists\\.|@googlegroups\\.|@yahoogroups\\.|@mailman\\.|@facebookmail\\.|noreply|do[-_]not[-_]reply|^buzz\\+"), "ignore messages with a recipient matching this expression") ("entity-raw", po::value<std::vector<fs::path> >(&entity), "paths to load data ONLY for entities") ("load-raw", po::value<std::vector<fs::path> >(&from), "paths to load data from"); po::options_description run_options("Export Options"); run_options.add_options() ("save", po::value<std::string>(&save_base), "base path to save the data at") ("remove-most-common", po::value<bool>(&remove_most_common)->default_value(1), "remove the most common individual (owner)") ("no-individuals", po::value<bool>(&no_individuals)->default_value(0), "ignore individuals") ("threshold", po::value<unsigned int>(&threshold)->default_value(1), "minimum mails for group") ("person-threshold", po::value<unsigned int>(&person_threshold)->default_value(2), "minimum mails for person"); po::options_description all_options("Email Topology Options"); all_options .add(general_options) .add(file_options) .add(run_options); if(argc < 2) { std::cout << all_options << std::endl; return 1; } po::variables_map vm; try { int options_style = po::command_line_style::default_style; po::store(po::parse_command_line(argc, argv, all_options, options_style), vm); po::notify(vm); } catch(std::exception& e) { std::cout << all_options << std::endl; std::cout << "Command line parsing failed: 
" << e.what() << std::endl; return 1; } if(vm.count("help")) { std::cout << all_options << std::endl; return 1; } std::copy(save_at_v.begin(), save_at_v.end(), std::inserter(save_at, save_at.end())); email_id_bimap email_id; connectedness_graph cg; initial_group_partition_map igpm; entity_map em; if(!vm.count("load-raw")) { std::cout << "must load something" << std::endl; return 1; } if(!vm.count("save")) { std::cout << "must save something" << std::endl; return 1; } std::size_t max_id = 0; std::vector<char> buffer(128 * 1024); try { boost::regex re_ignore(ignore_string); boost::regex re_loader("([^\t]+)"); std::cout << "resolving entities" << std::endl; for(std::vector<fs::path>::iterator i = entity.begin(); i != entity.end(); ++i) { if(!fs::exists(*i)) throw std::runtime_error(std::string("input file not found: ") + i->file_string()); std::cout << "loading " << i->file_string(); fs::ifstream in(*i); //we don't care about messages here while(in.good()) { in.getline(&buffer[0], buffer.size()); std::string line = &buffer[0]; boost::algorithm::trim(line); if(line == "-") { break; } bool first = true; for(boost::sregex_iterator j(line.begin(), line.end(), re_loader), e; j != e; ++j) { if(first) { first = false; } else { std::string email_address = (*j)[0].str(); if(regex_search(email_address, re_ignore)) { continue; } std::pair<email_id_bimap::map_by<email>::iterator, bool> result = email_id.by<email>().insert( email_id_bimap::map_by<email>::value_type(email_address, email_id.size())); if(result.second) std::cout << "@" << std::flush; } } } while(in.good()) { in.getline(&buffer[0], buffer.size()); std::string line = &buffer[0]; boost::algorithm::trim(line); std::string email_address; bool first = true; for(boost::sregex_iterator j(line.begin(), line.end(), re_loader), e; j != e; ++j) { if(first) { first = false; email_address = (*j)[0].str(); if(regex_search(email_address, re_ignore)) { break; } } else { std::string name = (*j)[0].str(); try { 
em[email_address].insert(name); } catch(std::exception& e) { std::cout << "err missing: " << email_address << std::endl; throw; } } } } std::cout << std::endl; } resolve_entities(em, email_id); for(std::vector<fs::path>::iterator i = from.begin(); i != from.end(); ++i) { if(!fs::exists(*i)) throw std::runtime_error(std::string("input file not found: ") + i->file_string()); std::cout << "loading " << i->file_string(); fs::ifstream in(*i); while(in.good()) { in.getline(&buffer[0], buffer.size()); std::string line = &buffer[0]; boost::algorithm::trim(line); if(line == "-") { break; } members_t g; unsigned int count = 0; bool first = true; for(boost::sregex_iterator j(line.begin(), line.end(), re_loader), e; j != e; ++j) { if(first) { first = false; std::string number = (*j)[0].str(); count = boost::lexical_cast<unsigned int>(number); } else { std::string email_address = (*j)[0].str(); if(regex_search(email_address, re_ignore)) { g.clear(); continue; } std::pair<email_id_bimap::map_by<email>::iterator, bool> result = email_id.by<email>().insert( email_id_bimap::map_by<email>::value_type(email_address, email_id.size())); if(result.second) std::cout << "@" << std::flush; g.insert(result.first->second); } } if(g.empty()) { //no emails? wtfs continue; } initial_group_partition_map::iterator r = igpm.find(g); if(r == igpm.end()) { connectedness_graph::vertex_descriptor node = gr::add_vertex(cg); cg[node].members = g; cg[node].weight = count; igpm.insert(r, std::make_pair(g, node)); } else { connectedness_graph::vertex_descriptor node = r->second; cg[node].weight += count; } std::cout << "." 
<< std::flush; } //no need to load em std::cout << std::endl; } max_id = email_id.size(); } catch(std::exception& e) { std::cout << "failed to load data: " << e.what() << std::endl; return 1; } std::map<unsigned int, score_t> ppl; for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) { if(cg[*i].weight >= threshold) { for(members_t::iterator j = cg[*i].members.begin(); j != cg[*i].members.end(); ++j) { ppl[*j] += cg[*i].weight; } ++i; } else { connectedness_graph::vertex_iterator to_erase = i++; gr::clear_vertex(*to_erase, cg); gr::remove_vertex(*to_erase, cg); } } //remove the owner, todo, this is evil because now there are dupe groups if A was owner and A B C and B C existed if(!ppl.empty()) { if(remove_most_common) { unsigned int max_person = ppl.begin()->first; score_t max_val = ppl.begin()->second; for(std::map<unsigned int, score_t>::iterator j = ppl.begin(); j != ppl.end(); ++j) { if(j->second > max_val) { max_val = j->second; max_person = j->first; } } for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) { cg[*i].members.erase(max_person); if(cg[*i].members.empty()) { connectedness_graph::vertex_iterator to_delete = i; ++i; gr::clear_vertex(*to_delete, cg); gr::remove_vertex(*to_delete, cg); } else { ++i; } } } for(std::map<unsigned int, score_t>::iterator j = ppl.begin(); j != ppl.end();) { if(j->second >= person_threshold) { std::map<unsigned int, score_t>::iterator to_delete = j++; ppl.erase(to_delete); } else { ++j; } } for(std::map<unsigned int, score_t>::iterator j = ppl.begin(); j != ppl.end(); ++j) { for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) { cg[*i].members.erase(j->first); if(cg[*i].members.empty()) { connectedness_graph::vertex_iterator to_delete = i; ++i; gr::clear_vertex(*to_delete, cg); gr::remove_vertex(*to_delete, cg); } else { ++i; } } } } if(no_individuals) { 
for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) { if(cg[*i].members.size() > 1) { ++i; } else { connectedness_graph::vertex_iterator to_erase = i++; gr::clear_vertex(*to_erase, cg); gr::remove_vertex(*to_erase, cg); } } } //normalize group weights for large groups for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) { if(cg[*i].members.size() < 20) continue; cg[*i].weight *= score_t(20) / cg[*i].members.size(); } unsigned int vertex_number = 0; for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) { cg[*i].index = vertex_number++; } std::cout << "converting to person graph" << std::endl; people_graph pg; std::map<unsigned int, people_graph::vertex_descriptor> remaining_people; for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) { group& g = cg[*i]; for(members_t::const_iterator j = g.members.begin(); j != g.members.end(); ++j) { //if there is a new person represented add them to the map std::pair<std::map<unsigned int, people_graph::vertex_descriptor>::iterator, bool> res = remaining_people.insert(std::make_pair(*j, people_graph::vertex_descriptor())); if(res.second) { res.first->second = gr::add_vertex(pg); person& p = pg[res.first->second]; p.id = res.first->first; p.name = email_id.by<bit>().equal_range(p.id).first->second; } } } for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) { group& g = cg[*i]; for(members_t::const_iterator j = g.members.begin(); j != g.members.end(); ++j) { members_t::const_iterator k = j; for(++k; k != g.members.end(); ++k) { //duplicates eliminated by setS type container people_graph::edge_descriptor l = gr::add_edge(remaining_people[*j], remaining_people[*k], pg).first; edge& e = pg[l]; e.weight += g.weight; } } } fs::path path(save_base); if(fs::exists(path)) fs::remove(path); 
fs::ofstream out(path); gr::dynamic_properties dp; dp.property("label", get(&person::name, pg)); dp.property("weight", gr::get(&edge::weight, pg)); gr::write_graphml(out, pg, gr::get(&person::id, pg), dp, false); return 0; }