예제 #1
0
int
do_import(std::string file, int sorted, uint_t limit, 
          int &rnadded, int &rnlines) {
#if defined USE_CXX_IO
    std::ifstream fin(file.c_str());
#else
    FILE *fin = fopen(file.c_str(), "r");
#endif

    int fd = open(file.c_str(), O_RDONLY);

    // Potential race condition + not checking for return value
    if_length = file_size(file.c_str());

    DCERR("handle_import::file:"<<file<<endl);

    if (!fin || !fd) {
        return -IMPORT_FILE_NOT_FOUND;
    }
    else {
        building = true;
        int nlines = 0;
        int foffset = 0;

        if (if_mmap_addr) {
            munmap(if_mmap_addr, if_length);
        }

        // mmap() the input file in
        if_mmap_addr = (char*)mmap(NULL, if_length, PROT_READ, MAP_SHARED, fd, 0);
        if (!if_mmap_addr) {
            fclose(fin);
            close(fd);
            return -IMPORT_FILE_NOT_FOUND;
        }

        pm.repr.clear();
        char buff[INPUT_LINE_SIZE];

        while (
#if defined USE_CXX_IO
               fin
#else
            !feof(fin)
#endif
               && limit--) {

            buff[0] = '\0';

#if defined USE_CXX_IO
            fin.getline(buff, INPUT_LINE_SIZE);
            const int llen = fin.gcount();
            buff[INPUT_LINE_SIZE - 1] = '\0';
#else
            char *got = fgets(buff, INPUT_LINE_SIZE, fin);
            if (!got) {
                break;
            }
            const int llen = strlen(buff);
            if (llen && buff[llen-1] == '\n') {
                buff[llen-1] = '\0';
            }
#endif

            ++nlines;

            int weight = 0;
            std::string phrase;
            StringProxy snippet;
            InputLineParser(if_mmap_addr, foffset, buff, &weight, &phrase, &snippet).start_parsing();

            foffset += llen;

            if (!phrase.empty()) {
                str_lowercase(phrase);
                DCERR("Adding: "<<weight<<", "<<phrase<<", "<<std::string(snippet)<<endl);
                pm.insert(weight, phrase, snippet);
            }
        }

        fclose(fin);
        pm.finalize(sorted);
        vui_t weights;
        for (size_t i = 0; i < pm.repr.size(); ++i) {
            weights.push_back(pm.repr[i].weight);
        }
        st.initialize(weights);

        rnadded = weights.size();
        rnlines = nlines;

        building = false;
    }

    return 0;
}