/**
 * Fetch the next line of raw CSV text as a list of fields.
 *
 * csv_parse_line() is called repeatedly for as long as it answers with the
 * GINT_TO_POINTER(-1) sentinel; the first other result is handed back to
 * the caller unchanged.
 * NOTE(review): the sentinel presumably marks a line that must be skipped;
 * confirm against csv_parse_line().
 *
 * \param contents	A pointer to raw CSV contents to parse.
 *			Pointer content will be changed to the end of
 *			the parsed line.
 * \param separator	Separator to use.
 *
 * \return		Parsed list or NULL upon failure (last line).
 */
GSList * csv_get_next_line ( gchar ** contents, gchar * separator )
{
    for ( ;; )
    {
        GSList * parsed = csv_parse_line ( contents, separator );

        /* Anything other than the sentinel is the final answer. */
        if ( parsed != GINT_TO_POINTER(-1) )
            return parsed;
    }
}
// returns // 0: on success // E_LINE_TOO_WIDE: on line too wide // E_QUOTED_STRING: at least 1 Quoted String is ill-formatted // int csv_parse(FILE *fp, CSV_CB_record_handler cb, void *params) { // char buff[MAX_LINE_LEN]; struct csv_parser_data d; d.callback = cb; d.params = params; while (d.buff[MAX_LINE_LEN - 1] = '*', NULL != fgets(d.buff, MAX_LINE_LEN, fp)) { int r; if (d.buff[MAX_LINE_LEN - 1] == '\0' && d.buff[MAX_LINE_LEN - 2] != '\n') return E_LINE_TOO_WIDE; if (E_QUOTED_STRING == (r = csv_parse_line(&d))) return E_QUOTED_STRING; else if (r != 0) break; } return 0; }
// Index the CSV file at `datapath` into the Xapian database at `dbpath`.
//
// The database is created if needed, otherwise opened for writing.  The
// first CSV line must be a header whose columns 0, 2 and 8 are "id_NUMBER",
// "TITLE" and "DESCRIPTION"; otherwise an error is printed to cerr and the
// process exits with status 1.  Every following line becomes one document,
// keyed by the boolean term "Q" + identifier, so re-running the indexer
// replaces documents instead of duplicating them.
//
// Field lookups use vector::at(), so a line with too few fields raises
// std::out_of_range.  Exceptions thrown by Xapian, including a failure of
// the final commit, propagate to the caller.
Indexer::Indexer(const string &datapath, const string &dbpath)
{
    // Hardcode field offsets for simplicity.
    const size_t FIELD_ID_NUMBER = 0;
    const size_t FIELD_TITLE = 2;
    const size_t FIELD_DESCRIPTION = 8;

    // Create or open the database we're going to be writing to.
    Xapian::WritableDatabase db(dbpath, Xapian::DB_CREATE_OR_OPEN);

    // Set up a TermGenerator that we'll use in indexing.
    Xapian::TermGenerator termgenerator;
    termgenerator.set_stemmer(Xapian::Stem("en"));

    ifstream csv(datapath.c_str());
    vector<string> fields;
    csv_parse_line(csv, fields);

    // Check the CSV header line matches our hard-coded offsets.
    if (fields.at(FIELD_ID_NUMBER) != "id_NUMBER" ||
        fields.at(FIELD_TITLE) != "TITLE" ||
        fields.at(FIELD_DESCRIPTION) != "DESCRIPTION") {
        // The CSV format doesn't match what we expect.
        cerr << "CSV format has changed!" << endl;
        exit(1);
    }

    while (csv_parse_line(csv, fields)) {
        // 'fields' is a vector mapping from field number to value.
        // We look up fields with the 'at' method so we get an exception
        // if that field isn't set.
        //
        // We're just going to use DESCRIPTION, TITLE and id_NUMBER.
        const string & description = fields.at(FIELD_DESCRIPTION);
        const string & title = fields.at(FIELD_TITLE);
        const string & identifier = fields.at(FIELD_ID_NUMBER);

        // We make a document and tell the term generator to use this.
        Xapian::Document doc;
        termgenerator.set_document(doc);

        // Index each field with a suitable prefix.
        termgenerator.index_text(title, 1, "S");
        termgenerator.index_text(description, 1, "XD");

        // Index fields without prefixes for general search.  The term
        // position is bumped between the two fields to leave a gap
        // between title terms and description terms.
        termgenerator.index_text(title);
        termgenerator.increase_termpos();
        termgenerator.index_text(description);

        // Store all the fields for display purposes.
        doc.set_data(identifier + "\n" + title + "\n" + description);

        // We use the identifier to ensure each object ends up in the
        // database only once no matter how many times we run the
        // indexer.
        string idterm = "Q" + identifier;
        doc.add_boolean_term(idterm);
        db.replace_document(idterm, doc);
    }

    // Commit explicitly: the WritableDatabase destructor would also commit,
    // but it swallows any exception, so a failed commit would go unnoticed.
    db.commit();
}