// Builds the raw group/entity data file from a folder of saved mail messages.
//
// Every regular file below the --load directory is opened and its leading
// block of non-blank lines (the headers) is scanned with re_terms.  The
// addresses found in From/To/Cc form one group; identical groups are merged
// and their weight counts the number of messages.  Messages whose Message-ID
// was already seen are skipped.  The result is written to --save-raw as:
//   <weight> TAB <address> TAB <address> ...      (one line per group)
//   -
//   <address> TAB <name> TAB <name> ...           (one line per entity)
//
// Returns 0 on success and 1 on any usage, argument or output error.
int main(int argc, char* argv[])
{
    fs::path from;
    fs::path to;

    //[Gmail]/Sent Mail
    po::options_description general_options("General");
    general_options.add_options()
        ("help", "list options");
    po::options_description file_options("Load");
    file_options.add_options()
        ("save-raw", po::value<fs::path>(&to), "path to save the data (after download phase)");
    po::options_description source_options("Download");
    source_options.add_options()
        ("load", po::value<fs::path>(&from), "mail folder");

    po::options_description all_options("Email Topology Options");
    all_options
        .add(general_options)
        .add(file_options)
        .add(source_options);

    if(argc < 2) {
        std::cout << all_options << std::endl;
        return 1;
    }

    po::variables_map vm;
    try {
        int options_style = po::command_line_style::default_style;
        po::store(po::parse_command_line(argc, argv, all_options, options_style), vm);
        po::notify(vm);
    } catch(const std::exception& e) {
        std::cout << all_options << std::endl;
        std::cout << "Command line parsing failed: " << e.what() << std::endl;
        return 1;
    }

    if(vm.count("help")) {
        std::cout << all_options << std::endl;
        return 1;
    }

    email_id_bimap email_id;
    connectedness_graph cg;
    entity_map em;
    initial_group_partition_map igpm;
    message_id_set message_id;

    if(!vm.count("save-raw")) {
        std::cout << "you must specify --save-raw with a file name" << std::endl;
        return 1;
    }
    if(!vm.count("load") || !fs::exists(from) || !fs::is_directory(from)) {
        std::cout << "missing source data folder (or not a folder)" << std::endl;
        return 1;
    }

    std::string headers;
    std::string line;
    std::cout << "loading " << from;
    for(fs::recursive_directory_iterator fe, fi(from); fi != fe; ++fi) {
        if(fs::is_directory(*fi))
            continue;
        fs::ifstream in(*fi);
        headers.clear();
        //collect the header block: everything up to the first blank line.
        //std::getline has no line length limit, so an unusually long header
        //line can no longer cut the block short
        while(std::getline(in, line)) {
            boost::algorithm::trim(line);
            if(line.empty())
                break;
            headers += "\n";
            headers += line;
        }

        members_t g;
        for(boost::sregex_iterator i(headers.begin(), headers.end(), re_terms), e; i != e; ++i) {
            const boost::smatch& what = *i;
            std::string field = what[1].str();
            if(boost::algorithm::iequals(field, "Message-ID")) {
                std::string id = what[2].str();
                boost::algorithm::trim(id);
                std::pair<message_id_set::iterator, bool> result = message_id.insert(id);
                //skip this duplicate message
                if(!result.second) {
                    g.clear();
                    break;
                }
            } else if(boost::algorithm::iequals(field, "From") || boost::algorithm::iequals(field, "To") || boost::algorithm::iequals(field, "Cc")) {
                std::string data = what[2].str();
                boost::replace_all(data, "\n", "");
                boost::replace_all(data, "\r", "");
                for(boost::sregex_iterator j(data.begin(), data.end(), re_email), je; j != je; ++j) {
                    //capture 1 or 2 holds the display name, capture 3 the address
                    std::string name = (*j)[1].str();
                    if(name.empty())
                        name = (*j)[2].str();
                    std::string email_address = (*j)[3].str();
                    boost::algorithm::to_lower(email_address);
                    boost::algorithm::trim(name);
                    //a new address gets the next free id (the current bimap size)
                    std::pair<email_id_bimap::map_by<email>::iterator, bool> result = email_id.by<email>().insert(
                        email_id_bimap::map_by<email>::value_type(email_address, email_id.size()));
                    if(result.second)
                        std::cout << "@" << std::flush;
                    //only remember names that add information beyond the address itself
                    if(!name.empty() && boost::to_lower_copy(name) != email_address)
                        em[email_address].insert(name);
                    g.insert(result.first->second);
                }
            }
        }
        if(g.empty()) {
            continue;
        }

        //merge identical groups, counting one message per occurrence
        initial_group_partition_map::iterator r = igpm.find(g);
        if(r == igpm.end()) {
            connectedness_graph::vertex_descriptor node = gr::add_vertex(cg);
            cg[node].members = g;
            cg[node].weight = 1;
            igpm.insert(r, std::make_pair(g, node));
        } else {
            connectedness_graph::vertex_descriptor node = r->second;
            cg[node].weight++;
        }
        std::cout << ". " <<  std::flush;
    }
    std::cout << std::endl;

    if(fs::exists(to))
        fs::remove(to);
    fs::ofstream out(to);
    if(!out) {
        std::cout << "unable to open output file " << to.file_string() << std::endl;
        return 1;
    }
    std::cout << "saving data to " << to.file_string() << std::endl;
    //group section: weight followed by the member addresses
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) {
        out << static_cast<unsigned long long>(cg[*i].weight);
        for(members_t::iterator j = cg[*i].members.begin(); j != cg[*i].members.end(); ++j) {
            out << "\t" << email_id.by<bit>().equal_range(*j).first->second;
        }
        out << std::endl;
    }
    //separator between the group section and the entity section
    out << "-" << std::endl;
    for(entity_map::iterator i = em.begin(); i != em.end(); ++i) {
        out << i->first;
        for(std::set<std::string>::iterator k = i->second.begin(); k != i->second.end(); ++k) {
            out << "\t" << *k;
        }
        out << std::endl;
    }
    return 0;
}
// Example #2
// 0
// Compresses every file returned by find_files() for pPath into a temporary
// file and, when m_verify_compressed_data is set, decompresses it again and
// compares the result with the original.
//
// lzham_dll  codec interface handed through to the compress/decompress helpers
// pPath      directory to scan (the trailing 'true' passed to find_files is
//            presumably the "recursive" flag - confirm against find_files)
// options    base options; when m_randomize_params is set a fresh random
//            parameter set is derived from them for every file
//
// Returns true when all readable files were processed successfully.  Files
// that cannot be opened are skipped.  On any failure the function returns
// false immediately and the temporary files are left on disk.
static bool test_recursive(ilzham &lzham_dll, const char *pPath, comp_options options)
{
   string_array files;
   if (!find_files(pPath, "*", files, true))
   {
      print_error("Failed finding files under path \"%s\"!\n", pPath);
      return false;
   }

   uint total_files_compressed = 0;
   uint64 total_source_size = 0;
   uint64 total_comp_size = 0;

#ifdef WIN32
   // Baseline used for the per-file memory report below.
   MEMORYSTATUS initial_mem_status;
   GlobalMemoryStatus(&initial_mem_status);
#endif

   timer_ticks start_tick_count = timer::get_ticks();

   const int first_file_index = 0;

   // The tick count makes the temporary file names distinct between runs.
   uint unique_id = static_cast<uint>(timer::get_init_ticks());
   char cmp_file[256], decomp_file[256];

#ifdef _XBOX
   sprintf(cmp_file, "e:\\__comp_temp_%u__.tmp", unique_id);
   sprintf(decomp_file, "e:\\__decomp_temp_%u__.tmp", unique_id);
#else
   sprintf(cmp_file, "__comp_temp_%u__.tmp", unique_id);
   sprintf(decomp_file, "__decomp_temp_%u__.tmp", unique_id);
#endif

   for (uint file_index = first_file_index; file_index < files.size(); file_index++)
   {
      const std::string &src_file = files[file_index];

      printf("***** [%u of %u] Compressing file \"%s\" to \"%s\"\n", 1 + file_index, (uint)files.size(), src_file.c_str(), cmp_file);

      // Determine the source size by seeking to the end of the file.
      FILE *pFile = fopen(src_file.c_str(), "rb");
      if (!pFile)
      {
         printf("Skipping unreadable file \"%s\"\n", src_file.c_str());
         continue;
      }
      fseek(pFile, 0, SEEK_END);
      int64 src_file_size = _ftelli64(pFile);
      fclose(pFile);

      if (!ensure_file_is_writable(cmp_file))
      {
         print_error("Unable to create file \"%s\"!\n", cmp_file);
         return false;
      }

      comp_options file_options(options);
      if (options.m_randomize_params)
      {
         file_options.m_comp_level = static_cast<lzham_compress_level>(rand() % LZHAM_TOTAL_COMP_LEVELS);
         file_options.m_dict_size_log2 = LZHAM_MIN_DICT_SIZE_LOG2 + (rand() % (LZHAMTEST_MAX_POSSIBLE_DICT_SIZE - LZHAM_MIN_DICT_SIZE_LOG2 + 1));
         file_options.m_max_helper_threads = rand() % (LZHAM_MAX_HELPER_THREADS + 1);
         file_options.m_unbuffered_decompression = (rand() & 1) != 0;
#if !LZHAMTEST_NO_RANDOM_EXTREME_PARSING
         file_options.m_extreme_parsing = (rand() & 1) != 0;
#endif
         file_options.m_force_polar_codes = (rand() & 1) != 0;
         file_options.m_deterministic_parsing = (rand() & 1) != 0;

         file_options.print();
      }

      bool status = compress_streaming(lzham_dll, src_file.c_str(), cmp_file, file_options);
      if (!status)
      {
         print_error("Failed compressing file \"%s\" to \"%s\"\n", src_file.c_str(), cmp_file);
         return false;
      }

      if (file_options.m_verify_compressed_data)
      {
         printf("Decompressing file \"%s\" to \"%s\"\n", cmp_file, decomp_file);

         if (!ensure_file_is_writable(decomp_file))
         {
            print_error("Unable to create file \"%s\"!\n", decomp_file);
            return false;
         }

         status = decompress_file(lzham_dll, cmp_file, decomp_file, file_options);
         if (!status)
         {
            // The input of the decompression step is the compressed temp file.
            print_error("Failed decompressing file \"%s\" to \"%s\"\n", cmp_file, decomp_file);
            return false;
         }

         printf("Comparing file \"%s\" to \"%s\"\n", decomp_file, src_file.c_str());

         if (!compare_files(decomp_file, src_file.c_str()))
         {
            print_error("Failed comparing decompressed file data while compressing \"%s\" to \"%s\"\n", src_file.c_str(), cmp_file);
            return false;
         }
         else
         {
            printf("Decompressed file compared OK to original file.\n");
         }
      }

      // A compressed file that cannot be reopened counts as zero bytes.
      int64 cmp_file_size = 0;
      pFile = fopen(cmp_file, "rb");
      if (pFile)
      {
         fseek(pFile, 0, SEEK_END);
         cmp_file_size = _ftelli64(pFile);
         fclose(pFile);
      }

      total_files_compressed++;
      total_source_size += src_file_size;
      total_comp_size += cmp_file_size;

#ifdef WIN32
      MEMORYSTATUS mem_status;
      GlobalMemoryStatus(&mem_status);

#ifdef _XBOX
      const int64 bytes_allocated = initial_mem_status.dwAvailPhys - mem_status.dwAvailPhys;
#else
      const int64 bytes_allocated = initial_mem_status.dwAvailVirtual- mem_status.dwAvailVirtual;
#endif

      printf("Memory allocated relative to first file: %I64i\n", bytes_allocated);
#endif

      printf("\n");
   }

   timer_ticks end_tick_count = timer::get_ticks();

   double total_elapsed_time = timer::ticks_to_secs(end_tick_count - start_tick_count);

   printf("Test successful: %f secs\n", total_elapsed_time);
   printf("Total files processed: %u\n", total_files_compressed);
   printf("Total source size: " QUAD_INT_FMT "\n", total_source_size);
   printf("Total compressed size: " QUAD_INT_FMT "\n", total_comp_size);

   // Temporary files are only cleaned up on the success path.
   remove(cmp_file);
   remove(decomp_file);

   return true;
}
int main(int argc, char* argv[])
{
    std::string server;
    std::string port;
    std::string user;
    std::string folder;
    std::string password;
    std::vector<fs::path> from;
    std::vector<fs::path> entity;
    fs::path to;
    
    //[Gmail]/Sent Mail
    po::options_description general_options("General");
    general_options.add_options()
        ("help", "list options");
    po::options_description file_options("Load");
    file_options.add_options()
        ("save-raw", po::value<fs::path>(&to), "path to save the data (after download phase)");
    po::options_description download_options("Download");
    download_options.add_options()
        ("server", po::value<std::string>(&server), "imap server dns/ip")
        ("port", po::value<std::string>(&port)->default_value("993"), "imap port")
        ("folder", po::value<std::string>(&folder)->default_value("Sent"), "imap folder")
        ("user", po::value<std::string>(&user), "imap username")
        ("password", po::value<std::string>(&password), "imap password (will ask if not specified)");
    po::options_description run_options("Run");
    
    po::options_description all_options("Email Topology Options");
    all_options
        .add(general_options)
        .add(file_options)
        .add(download_options);

    if(argc < 2) {
        std::cout << all_options << std::endl;
        return 1;
    }

    po::variables_map vm;
    try {
        int options_style = po::command_line_style::default_style;
        po::store(po::parse_command_line(argc, argv, all_options, options_style), vm);
        po::notify(vm);
    } catch(std::exception& e) {
        std::cout << all_options << std::endl;
        std::cout << "Command line parsing failed: " << e.what() << std::endl;
        return 1;
    }
    
    if(vm.count("help")) {
        std::cout << all_options << std::endl;
        return 1;
    }

    email_id_bimap email_id;
    connectedness_graph cg;
    entity_map em;
    initial_group_partition_map igpm;

    if(!vm.count("save-raw")) {
        std::cout << "you must specify --save-raw with a file name" << std::endl;
        return 1;
    }
    if(!vm.count("password")) {
        password = getpass("Password: "******"missing server for download" << std::endl;
        return 1;
    }
    if(user.empty()) {
        std::cout << "missing user for download" << std::endl;
        return 1;
    }
    if(password.empty()) {
        std::cout << "missing user for download" << std::endl;
        return 1;
    }
    //this is our network block, downloads all messages headers
    try
    {
        std::cout << "downloading " << folder << " from " << server << std::endl;
        //use to dedupe if there are dupes
        message_id_set message_id;        

        typedef boost::function<void (const std::string&, const std::list<std::string>& args)> untagged_handler;
        std::string pending_tag = "* ";
        std::list<std::string> pending_command;
        pending_command.push_back("WAIT_FOR_ACK");
        untagged_handler pending_handler;
        unsigned int command_id = 0;

        //The sequence of imap commands we want to run
        std::list<std::list<std::string> > commands;
        std::list<untagged_handler> handlers;

        handlers.push_back(log_handler());
        commands.push_back(std::list<std::string>());
        std::ostringstream login_os;
        login_os << "LOGIN \"" << user << "\" {" << password.size() << "}";
        commands.back().push_back(login_os.str()); 
        commands.back().push_back(password); 

        handlers.push_back(log_handler());
        commands.push_back(std::list<std::string>());
        commands.back().push_back("LIST \"\" *"); 

        handlers.push_back(log_handler());
        commands.push_back(std::list<std::string>());
        commands.back().push_back("SELECT \"" + folder + "\""); 

        handlers.push_back(header_handler(email_id, cg, em, message_id, igpm));
        commands.push_back(std::list<std::string>());
        commands.back().push_back("FETCH 1:* (BODY.PEEK[HEADER.FIELDS (MESSAGE-ID FROM TO CC)])");
        commands.push_back(std::list<std::string>());

        handlers.push_back(log_handler());
        commands.back().push_back("LOGOUT");
    
        //open ssl connection to the server, no cert checking
        asio::io_service io_service;
        asio::ip::tcp::resolver resolver(io_service);
        asio::ip::tcp::resolver::query query(server, port);
        asio::ip::tcp::resolver::iterator iterator = resolver.resolve(query);
        asio::ssl::context context(io_service, asio::ssl::context::sslv23);
        context.set_verify_mode(asio::ssl::context::verify_none);
        asio::ssl::stream<asio::ip::tcp::socket> socket(io_service, context);
        socket.lowest_layer().connect(*iterator);
        socket.handshake(asio::ssl::stream_base::client);
        asio::streambuf buf;

        while(true) {
            //read the next line of data
            std::size_t line_length = asio::read_until(socket, buf, re_crlf);
            std::string line(
                asio::buffers_begin(buf.data()),
                asio::buffers_begin(buf.data()) + line_length);
            buf.consume(line_length);
            boost::match_results<std::string::iterator> what;
            std::size_t initial = 0;
            std::list<std::string> args;
            //the line may be split into segments with chunks of data embedded, this is the case
            //for bodies or message header blocks that are returned, we only handle this case if it
            //comes in untagged response (*) not a continuation (+), i think that is normal
            while(regex_search(line.begin() + initial, line.end(), what, re_byte_buffer, boost::match_default)) {
                unsigned int bytes = boost::lexical_cast<unsigned int>(what[1].str());
                if(buf.size() < bytes)
                    asio::read(socket, buf, asio::transfer_at_least(bytes - buf.size()));
                args.push_back(
                    std::string(
                        asio::buffers_begin(buf.data()),
                        asio::buffers_begin(buf.data()) + bytes));
                buf.consume(bytes);
                line.resize(what[1].second - line.begin());
                initial = line.size();
                //read the next line of data
                line_length = asio::read_until(socket, buf, re_crlf);
                line += std::string(
                    asio::buffers_begin(buf.data()),
                    asio::buffers_begin(buf.data()) + line_length);
                buf.consume(line_length);
            }
            if(boost::algorithm::starts_with(line, pending_tag)) {
                //if the command is being completed, then we will go here, bail out if the response wasn't ok
                if(!boost::algorithm::starts_with(line, pending_tag + "OK")) {
                    std::cout << line;
                    throw std::runtime_error("command failed");
                }
                //pull the next command off the list
                pending_tag = "A" + boost::lexical_cast<std::string>(command_id++) + " ";
                if(commands.size() == 0)
                    break;
                pending_handler = handlers.front();
                pending_command = commands.front();
                commands.pop_front();
                handlers.pop_front();

                //send the command along with any data arguments
                std::cout << pending_tag << pending_command.front() << std::endl;
                asio::write(socket, asio::buffer(pending_tag.data(), pending_tag.size()));
                for(std::list<std::string>::iterator i = pending_command.begin(); i != pending_command.end(); ++i) {
                    if(i != pending_command.begin()) {
                        //print the continuation response
                        std::size_t line_length = asio::read_until(socket, buf, re_crlf);
                        std::string line(
                            asio::buffers_begin(buf.data()),
                            asio::buffers_begin(buf.data()) + line_length);
                        buf.consume(line_length);
                        std::cout << line << std::flush;
                        if(!boost::algorithm::starts_with(line, "+ ")) {
                            throw std::runtime_error("bad response when writing extra data");
                        }
                    } else {
                        //print it out as well (but not the args)
                        std::cout << *i << std::endl;
                    }
                    asio::write(socket, asio::buffer(i->data(), i->size()));
                    asio::write(socket, asio::buffer("\r\n", 2));
                }
            } else if(boost::algorithm::starts_with(line, "* ")) {
                //if there is a registered handler, dispatch to it
                if(pending_handler)
                    pending_handler(line, args);
            } else {
                throw std::runtime_error("unrecognized response");
            }
        }
    }
    catch (std::exception& e) {
        std::cout << "Exception: " << e.what() << std::endl;
        return 1;
    }
    std::cout << std::endl;
    
    if(to.empty()) {
        std::cout << "Missing output file for save" << std::endl;
        return 1;
    }
    if(fs::exists(to))
        fs::remove(to);
    fs::ofstream out(to);
    std::cout << "saving data to " << to.file_string();        
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) {
        out << (unsigned long long)cg[*i].weight;
        for(members_t::iterator j = cg[*i].members.begin(); j != cg[*i].members.end(); ++j) {
            out << "\t" << email_id.by<bit>().equal_range(*j).first->second;
        }
        out << std::endl;
    }
    out << "-" << std::endl;
    for(entity_map::iterator i = em.begin(); i != em.end(); ++i) {
        out << i->first;
        for(std::set<std::string>::iterator k = i->second.begin(); k != i->second.end(); ++k) {
            out << "\t" << *k; 
        }
        out << std::endl;
    }
    return 0;
}
int main(int argc, char* argv[])
{
    std::vector<fs::path> from;
    std::vector<fs::path> entity;
    std::string ignore_string, save_base;
    unsigned int threshold;
    unsigned int person_threshold;
    bool no_individuals;
    bool remove_most_common;
    std::vector<unsigned int> save_at_v;
    std::set<unsigned int> save_at;
    
    //[Gmail]/Sent Mail
    po::options_description general_options("General");
    general_options.add_options()
        ("help", "list options");
    po::options_description file_options("Load");
    file_options.add_options()
        ("ignore", po::value<std::string>(&ignore_string)->default_value("@lists\\.|@googlegroups\\.|@yahoogroups\\.|@mailman\\.|@facebookmail\\.|noreply|do[-_]not[-_]reply|^buzz\\+"), "ignore messages with a recipient matching this expression")
        ("entity-raw", po::value<std::vector<fs::path> >(&entity), "paths to load data ONLY for entities")
        ("load-raw", po::value<std::vector<fs::path> >(&from), "paths to load data from");
    po::options_description run_options("Export Options");
    run_options.add_options()
        ("save", po::value<std::string>(&save_base), "base path to save the data at")
        ("remove-most-common", po::value<bool>(&remove_most_common)->default_value(1), "remove the most common individual (owner)")
        ("no-individuals", po::value<bool>(&no_individuals)->default_value(0), "ignore individuals")
        ("threshold", po::value<unsigned int>(&threshold)->default_value(1), "minimum mails for group")
        ("person-threshold", po::value<unsigned int>(&person_threshold)->default_value(2), "minimum mails for person");
    
    po::options_description all_options("Email Topology Options");
    all_options
        .add(general_options)
        .add(file_options)
        .add(run_options);

    if(argc < 2) {
        std::cout << all_options << std::endl;
        return 1;
    }

    po::variables_map vm;
    try {
        int options_style = po::command_line_style::default_style;
        po::store(po::parse_command_line(argc, argv, all_options, options_style), vm);
        po::notify(vm);
    } catch(std::exception& e) {
        std::cout << all_options << std::endl;
        std::cout << "Command line parsing failed: " << e.what() << std::endl;
        return 1;
    }
    
    if(vm.count("help")) {
        std::cout << all_options << std::endl;
        return 1;
    }
    std::copy(save_at_v.begin(), save_at_v.end(), std::inserter(save_at, save_at.end()));

    email_id_bimap email_id;
    connectedness_graph cg;
    initial_group_partition_map igpm;
    entity_map em;

    if(!vm.count("load-raw")) {
        std::cout << "must load something" << std::endl;
        return 1;
    }
    if(!vm.count("save")) {
        std::cout << "must save something" << std::endl;
        return 1;
    }

    std::size_t max_id = 0;
    std::vector<char> buffer(128 * 1024);
    try {
        boost::regex re_ignore(ignore_string);
        boost::regex re_loader("([^\t]+)");
        std::cout << "resolving entities" << std::endl;
        for(std::vector<fs::path>::iterator i = entity.begin(); i != entity.end(); ++i) {
            if(!fs::exists(*i))
                throw std::runtime_error(std::string("input file not found: ") + i->file_string());
            std::cout << "loading " << i->file_string();
            fs::ifstream in(*i);
            //we don't care about messages here
            while(in.good()) {
                in.getline(&buffer[0], buffer.size());
                std::string line = &buffer[0];
                boost::algorithm::trim(line);
                if(line == "-") {
                    break;
                }
                bool first = true;
                for(boost::sregex_iterator j(line.begin(), line.end(), re_loader), e; j != e; ++j) {
                    if(first) {
                        first = false;
                    } else {
                        std::string email_address = (*j)[0].str();
                        if(regex_search(email_address, re_ignore)) {
                            continue;
                        }
                        std::pair<email_id_bimap::map_by<email>::iterator, bool> result = email_id.by<email>().insert(
                            email_id_bimap::map_by<email>::value_type(email_address, email_id.size()));
                        if(result.second)
                            std::cout << "@" << std::flush;
                    }
                }
            }
            while(in.good()) {
                in.getline(&buffer[0], buffer.size());
                std::string line = &buffer[0];
                boost::algorithm::trim(line);
                
                std::string email_address;
                bool first = true;
                for(boost::sregex_iterator j(line.begin(), line.end(), re_loader), e; j != e; ++j) {
                    if(first) {
                        first = false;
                        email_address = (*j)[0].str();
                        if(regex_search(email_address, re_ignore)) {
                            break;
                        }
                    } else {
                        std::string name = (*j)[0].str();
                        try {
                            em[email_address].insert(name);
                        } catch(std::exception& e) {
                            std::cout <<  "err missing: " << email_address << std::endl;
                            throw;
                        }
                    }
                }
            }
            std::cout << std::endl;
        }
        resolve_entities(em, email_id);
        for(std::vector<fs::path>::iterator i = from.begin(); i != from.end(); ++i) {
            if(!fs::exists(*i))
                throw std::runtime_error(std::string("input file not found: ") + i->file_string());
            std::cout << "loading " << i->file_string();
            fs::ifstream in(*i);
            while(in.good()) {
                in.getline(&buffer[0], buffer.size());
                std::string line = &buffer[0];
                boost::algorithm::trim(line);
                if(line == "-") {
                    break;
                }
                members_t g;
                unsigned int count = 0;
                bool first = true;
                for(boost::sregex_iterator j(line.begin(), line.end(), re_loader), e; j != e; ++j) {
                    if(first) {
                        first = false;
                        std::string number = (*j)[0].str();
                        count = boost::lexical_cast<unsigned int>(number);
                    } else {
                        std::string email_address = (*j)[0].str();
                        if(regex_search(email_address, re_ignore)) {
                            g.clear();
                            continue;
                        }
                        std::pair<email_id_bimap::map_by<email>::iterator, bool> result = email_id.by<email>().insert(
                            email_id_bimap::map_by<email>::value_type(email_address, email_id.size()));
                        if(result.second)
                            std::cout << "@" << std::flush;
                        g.insert(result.first->second);
                    }
                }

                if(g.empty()) {
                    //no emails? wtfs
                    continue;
                }
                initial_group_partition_map::iterator r = igpm.find(g);
                if(r == igpm.end()) {
                    connectedness_graph::vertex_descriptor node = gr::add_vertex(cg);
                    cg[node].members = g;
                    cg[node].weight = count;
                    igpm.insert(r, std::make_pair(g, node));
                } else {
                    connectedness_graph::vertex_descriptor node = r->second;
                    cg[node].weight += count;
                }
                std::cout << "." << std::flush;
            }
            //no need to load em
            std::cout << std::endl;
        }
        max_id = email_id.size();
    } catch(std::exception& e) {
        std::cout << "failed to load data: " << e.what() << std::endl;
        return 1;
    }

    std::map<unsigned int, score_t> ppl;
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) {
        if(cg[*i].weight >= threshold) {
            for(members_t::iterator j = cg[*i].members.begin(); j != cg[*i].members.end(); ++j) {
                ppl[*j] += cg[*i].weight;
            }
            ++i;
        } else {
            connectedness_graph::vertex_iterator to_erase = i++;
            gr::clear_vertex(*to_erase, cg);
            gr::remove_vertex(*to_erase, cg);
        }
    }
    //remove the owner, todo, this is evil because now there are dupe groups if A was owner and A B C and B C existed
    if(!ppl.empty()) {
        if(remove_most_common) {
            unsigned int max_person = ppl.begin()->first;
            score_t max_val = ppl.begin()->second;
            for(std::map<unsigned int, score_t>::iterator j = ppl.begin(); j != ppl.end(); ++j) {
                if(j->second > max_val) {
                    max_val = j->second;
                    max_person = j->first;
                }
            }
            for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) {
                cg[*i].members.erase(max_person);
                if(cg[*i].members.empty()) {
                    connectedness_graph::vertex_iterator to_delete = i;
                    ++i;
                    gr::clear_vertex(*to_delete, cg);
                    gr::remove_vertex(*to_delete, cg);
                } else {
                    ++i;
                }
            }
        }
        for(std::map<unsigned int, score_t>::iterator j = ppl.begin(); j != ppl.end();) {
            if(j->second >= person_threshold) {
                std::map<unsigned int, score_t>::iterator to_delete = j++;
                ppl.erase(to_delete);
            } else {
                ++j;
            }
        }
        for(std::map<unsigned int, score_t>::iterator j = ppl.begin(); j != ppl.end(); ++j) {
            for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) {
                cg[*i].members.erase(j->first);
                if(cg[*i].members.empty()) {
                    connectedness_graph::vertex_iterator to_delete = i;
                    ++i;
                    gr::clear_vertex(*to_delete, cg);
                    gr::remove_vertex(*to_delete, cg);
                } else {
                    ++i;
                }
            }
        }
    }
    if(no_individuals) {
        for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second;) {
            if(cg[*i].members.size() > 1) {
                ++i;
            } else {
                connectedness_graph::vertex_iterator to_erase = i++;
                gr::clear_vertex(*to_erase, cg);
                gr::remove_vertex(*to_erase, cg);
            }
        }
    }
    //normalize group weights for large groups
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) {
        if(cg[*i].members.size() < 20) 
            continue;
        cg[*i].weight *= score_t(20) / cg[*i].members.size();
    }
    unsigned int vertex_number = 0;
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) {
        cg[*i].index = vertex_number++;
    }
    
    std::cout << "converting to person graph" << std::endl;
    people_graph pg;
    std::map<unsigned int, people_graph::vertex_descriptor> remaining_people;
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) {
        group& g = cg[*i];
        for(members_t::const_iterator j = g.members.begin(); j != g.members.end(); ++j) {
            //if there is a new person represented add them to the map
            std::pair<std::map<unsigned int, people_graph::vertex_descriptor>::iterator, bool> res =
                remaining_people.insert(std::make_pair(*j, people_graph::vertex_descriptor()));
            if(res.second) {
                res.first->second = gr::add_vertex(pg);
                person& p = pg[res.first->second];
                p.id = res.first->first;
                p.name = email_id.by<bit>().equal_range(p.id).first->second;
            }
        }
    }
    for(connectedness_graph::vertex_iterator i = gr::vertices(cg).first; i != gr::vertices(cg).second; ++i) {
        group& g = cg[*i];
        for(members_t::const_iterator j = g.members.begin(); j != g.members.end(); ++j) {
            members_t::const_iterator k = j;
            for(++k; k != g.members.end(); ++k) {
                //duplicates eliminated by setS type container
                people_graph::edge_descriptor l = gr::add_edge(remaining_people[*j], remaining_people[*k], pg).first;
                edge& e = pg[l];
                e.weight += g.weight;
            }
        }
    }
    fs::path path(save_base);
    if(fs::exists(path))
        fs::remove(path);
    fs::ofstream out(path);

    gr::dynamic_properties dp;
    dp.property("label", get(&person::name, pg));
    dp.property("weight", gr::get(&edge::weight, pg));

    gr::write_graphml(out, pg, gr::get(&person::id, pg), dp, false);
    return 0;
}