int do_import(std::string file, int sorted, uint_t limit, int &rnadded, int &rnlines) { #if defined USE_CXX_IO std::ifstream fin(file.c_str()); #else FILE *fin = fopen(file.c_str(), "r"); #endif int fd = open(file.c_str(), O_RDONLY); // Potential race condition + not checking for return value if_length = file_size(file.c_str()); DCERR("handle_import::file:"<<file<<endl); if (!fin || !fd) { return -IMPORT_FILE_NOT_FOUND; } else { building = true; int nlines = 0; int foffset = 0; if (if_mmap_addr) { munmap(if_mmap_addr, if_length); } // mmap() the input file in if_mmap_addr = (char*)mmap(NULL, if_length, PROT_READ, MAP_SHARED, fd, 0); if (!if_mmap_addr) { fclose(fin); close(fd); return -IMPORT_FILE_NOT_FOUND; } pm.repr.clear(); char buff[INPUT_LINE_SIZE]; while ( #if defined USE_CXX_IO fin #else !feof(fin) #endif && limit--) { buff[0] = '\0'; #if defined USE_CXX_IO fin.getline(buff, INPUT_LINE_SIZE); const int llen = fin.gcount(); buff[INPUT_LINE_SIZE - 1] = '\0'; #else char *got = fgets(buff, INPUT_LINE_SIZE, fin); if (!got) { break; } const int llen = strlen(buff); if (llen && buff[llen-1] == '\n') { buff[llen-1] = '\0'; } #endif ++nlines; int weight = 0; std::string phrase; StringProxy snippet; InputLineParser(if_mmap_addr, foffset, buff, &weight, &phrase, &snippet).start_parsing(); foffset += llen; if (!phrase.empty()) { str_lowercase(phrase); DCERR("Adding: "<<weight<<", "<<phrase<<", "<<std::string(snippet)<<endl); pm.insert(weight, phrase, snippet); } } fclose(fin); pm.finalize(sorted); vui_t weights; for (size_t i = 0; i < pm.repr.size(); ++i) { weights.push_back(pm.repr[i].weight); } st.initialize(weights); rnadded = weights.size(); rnlines = nlines; building = false; } return 0; }
int do_import(std::string file, uint_t limit, int &rnadded, int &rnlines) { bool is_input_sorted = true; #if defined USE_CXX_IO std::ifstream fin(file.c_str()); #else FILE *fin = fopen(file.c_str(), "r"); #endif int fd = open(file.c_str(), O_RDONLY); DCERR("handle_import::file:" << file << "[fin: " << (!!fin) << ", fd: " << fd << "]" << endl); if (!fin || fd == -1) { perror("fopen"); return -IMPORT_FILE_NOT_FOUND; } else { building = true; int nlines = 0; int foffset = 0; if (if_mmap_addr) { int r = munmap(if_mmap_addr, if_length); if (r < 0) { perror("munmap"); building = false; return -IMPORT_MUNMAP_FAILED; } } // Potential race condition + not checking for return value if_length = file_size(file.c_str()); // mmap() the input file in if_mmap_addr = (char*)mmap(NULL, if_length, PROT_READ, MAP_SHARED, fd, 0); if (if_mmap_addr == MAP_FAILED) { fprintf(stderr, "length: %llu, fd: %d\n", if_length, fd); perror("mmap"); if (fin) { fclose(fin); } if (fd != -1) { close(fd); } building = false; return -IMPORT_MMAP_FAILED; } pm.repr.clear(); char buff[INPUT_LINE_SIZE]; std::string prev_phrase; while (!is_EOF(fin) && limit--) { buff[0] = '\0'; int llen = -1; get_line(fin, buff, INPUT_LINE_SIZE, llen); if (llen == -1) { break; } ++nlines; int weight = 0; std::string phrase; StringProxy snippet; InputLineParser(if_mmap_addr, foffset, buff, &weight, &phrase, &snippet).start_parsing(); foffset += llen; if (!phrase.empty()) { str_lowercase(phrase); DCERR("Adding: " << weight << ", " << phrase << ", " << std::string(snippet) << endl); pm.insert(weight, phrase, snippet); } if (is_input_sorted && prev_phrase <= phrase) { prev_phrase.swap(phrase); } else if (is_input_sorted) { is_input_sorted = false; } } DCERR("Creating PhraseMap::Input is " << (!is_input_sorted ? "NOT " : "") << "sorted\n"); fclose(fin); pm.finalize(is_input_sorted); vui_t weights; for (size_t i = 0; i < pm.repr.size(); ++i) { weights.push_back(pm.repr[i].weight); } st.initialize(weights); rnadded = weights.size(); rnlines = nlines; building = false; } return 0; }