/** Loads a file into a GncCsvParseData. This is the first function * that must be called after createing a new GncCsvParseData. If this * fails because the file couldn't be opened, no more functions can be * called on the parse data until this succeeds (or until it fails * because of an encoding guess error). If it fails because the * encoding could not be guessed, gnc_csv_convert_encoding must be * called until it succeeds. * @param parse_data Data that is being parsed * @param filename Name of the file that should be opened * @param error Will contain an error if there is a failure * @return 0 on success, 1 on failure */ int gnc_csv_load_file (GncCsvParseData* parse_data, const char* filename, GError** error) { const char* guess_enc = NULL; /* Get the raw data first and handle an error if one occurs. */ parse_data->raw_mapping = g_mapped_file_new (filename, FALSE, error); if (parse_data->raw_mapping == NULL) { /* TODO Handle file opening errors more specifically, * e.g. inexistent file versus no read permission. */ parse_data->raw_str.begin = NULL; g_clear_error (error); g_set_error (error, 0, GNC_CSV_FILE_OPEN_ERR, "%s", _("File opening failed.")); return 1; } /* Copy the mapping's contents into parse-data->raw_str. */ parse_data->raw_str.begin = g_mapped_file_get_contents (parse_data->raw_mapping); parse_data->raw_str.end = parse_data->raw_str.begin + g_mapped_file_get_length (parse_data->raw_mapping); /* Make a guess at the encoding of the data. */ if (!g_mapped_file_get_length (parse_data->raw_mapping) == 0) guess_enc = go_guess_encoding ((const char*)(parse_data->raw_str.begin), (size_t)(parse_data->raw_str.end - parse_data->raw_str.begin), "UTF-8", NULL); if (guess_enc == NULL) { g_set_error (error, 0, GNC_CSV_ENCODING_ERR, "%s", _("Unknown encoding.")); return 1; } /* Convert using the guessed encoding into parse_data->file_str and * handle any errors that occur. 
*/ gnc_csv_convert_encoding (parse_data, guess_enc, error); if (parse_data->file_str.begin == NULL) { g_set_error (error, 0, GNC_CSV_ENCODING_ERR, "%s", _("Unknown encoding.")); return 1; } else return 0; }
/*
 * Quick and dirty html probe.
 *
 * Reads a leading chunk of @input, hands it to go_guess_encoding, and
 * lower-cases the resulting string.  Returns TRUE when that string
 * contains "<table", "<html" or "<!doctype html"; FALSE otherwise,
 * including when the read or the encoding guess fails.
 */
gboolean
html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input,
		 G_GNUC_UNUSED GOFileProbeLevel pl)
{
	static const char *const markers[] = {
		"<table",
		"<html",
		"<!doctype html"
	};
	gsf_off_t probe_len = 200;
	guint8 const *bytes;
	GString *decoded;
	gchar *lowered;
	gboolean found = FALSE;
	size_t k;

	/* Avoid seeking in large streams - try a fixed-size read first and
	 * only fall back to the full stream size if the stream is too
	 * short for that.  */
	bytes = gsf_input_read (input, probe_len, NULL);
	if (bytes == NULL) {
		probe_len = gsf_input_size (input);
		bytes = gsf_input_read (input, probe_len, NULL);
		if (bytes == NULL)
			return FALSE;
	}

	if (!go_guess_encoding (bytes, probe_len, NULL, &decoded, NULL))
		return FALSE;

	/* Only the lower-cased copy is needed from here on. */
	lowered = g_utf8_strdown (decoded->str, -1);
	g_string_free (decoded, TRUE);
	if (lowered == NULL)
		return FALSE;

	for (k = 0; k < sizeof markers / sizeof markers[0]; k++) {
		if (strstr (lowered, markers[k]) != NULL) {
			found = TRUE;
			break;
		}
	}

	g_free (lowered);
	return found;
}
/*
 * csv_tsv_probe:
 * @fo: file opener (only forwarded to the recursive call)
 * @input: stream being probed
 * @pl: probe level
 *
 * At GO_FILE_PROBE_CONTENT level: rough and ready heuristic -- if the
 * first bytes of the stream (at most 512) decode to text containing no
 * unprintable characters, this may be text.  An empty stream is judged
 * by its name instead.
 *
 * At any other level: accept names whose extension is "csv", "tsv" or
 * "txt" (compared case-insensitively).
 */
static gboolean
csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
{
	const gsf_off_t sample_max = 512;
	guint8 const *sample;
	gsf_off_t sample_len;
	GString *text;
	char const *cursor;
	gboolean printable = TRUE;

	if (pl != GO_FILE_PROBE_CONTENT) {
		char const *ext;
		char const *name = gsf_input_name (input);

		if (name == NULL)
			return FALSE;

		ext = gsf_extension_pointer (name);
		if (ext == NULL)
			return FALSE;

		return (g_ascii_strcasecmp (ext, "csv") == 0 ||
			g_ascii_strcasecmp (ext, "tsv") == 0 ||
			g_ascii_strcasecmp (ext, "txt") == 0);
	}

	if (gsf_input_seek (input, 0, G_SEEK_SET))
		return FALSE;

	sample_len = gsf_input_remaining (input);

	/* If someone ships us an empty file, accept it only if
	   it has a proper name.  */
	if (sample_len == 0)
		return csv_tsv_probe (fo, input, GO_FILE_PROBE_FILE_NAME);

	if (sample_len > sample_max)
		sample_len = sample_max;

	sample = gsf_input_read (input, sample_len, NULL);
	if (sample == NULL)
		return FALSE;

	if (!go_guess_encoding (sample, sample_len, NULL, &text, NULL))
		return FALSE;

	cursor = text->str;
	while (*cursor != '\0') {
		gunichar ch = g_utf8_get_char (cursor);

		/* isprint might not be true for these: */
		gboolean is_layout = (ch == '\n' || ch == '\t' || ch == '\r');

		/* Also, ignore a byte-order mark which may be used to
		 * indicate UTF-8; see
		 * http://en.wikipedia.org/wiki/Byte_Order_Mark for
		 * background.  Only the very first character qualifies.
		 */
		gboolean is_leading_bom =
			(cursor == text->str && ch == 0x0000FEFF);

		if (!is_layout && !is_leading_bom && !g_unichar_isprint (ch)) {
			printable = FALSE;
			break;
		}

		cursor = g_utf8_next_char (cursor);
	}

	g_string_free (text, TRUE);
	return printable;
}
/*
 * stf_read_workbook_auto_csvtab:
 * @fo: file opener (unused)
 * @enc: optional encoding
 * @context: command context
 * @view: view whose workbook receives the new sheet
 * @input: file to read from+convert
 *
 * Attempt to auto-detect CSV or tab-delimited file.
 *
 * The data is parsed twice: once with stf_parse_general to measure how
 * many rows and columns are needed, and once with stf_parse_sheet to
 * fill a newly created sheet named after the input's base name.
 */
static void
stf_read_workbook_auto_csvtab (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
			       GOIOContext *context,
			       GoView *view, GsfInput *input)
{
	WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
	Workbook *book;
	Sheet *sheet;
	char *raw;
	size_t raw_len;
	GString *text;
	StfParseOptions_t *po;
	const char *input_name;
	const char *ext;
	char *sheet_name;
	GStringChunk *chunk;
	GPtrArray *parsed;
	int n_cols, n_rows, row;

	g_return_if_fail (context != NULL);
	g_return_if_fail (wbv != NULL);

	book = wb_view_get_workbook (wbv);

	raw = stf_preparse (context, input, &raw_len);
	if (raw == NULL)
		return;

	/* The raw buffer is not needed once the guess has been made. */
	enc = go_guess_encoding (raw, raw_len, enc, &text, NULL);
	g_free (raw);

	if (enc == NULL) {
		go_cmd_context_error_import (GO_CMD_CONTEXT (context),
					     _("That file is not in the given encoding."));
		return;
	}

	clear_stray_NULs (context, text);

	/*
	 * Try to get the filename we're reading from.  This is not a
	 * great way.
	 * NOTE(review): the name is used below without a NULL check --
	 * confirm gsf_input_name cannot return NULL for inputs reaching here.
	 */
	input_name = gsf_input_name (input);
	ext = gsf_extension_pointer (input_name);

	/* A ".csv" extension selects the CSV-specific guesser. */
	po = (ext != NULL && strcasecmp (ext, "csv") == 0)
		? stf_parse_options_guess_csv (text->str)
		: stf_parse_options_guess (text->str);

	/* First pass: parse into a scratch array just to size the sheet. */
	chunk = g_string_chunk_new (100 * 1024);
	parsed = stf_parse_general (po, chunk,
				    text->str, text->str + text->len);
	n_rows = parsed->len;
	n_cols = 0;
	row = 0;
	while (row < n_rows) {
		GPtrArray *fields = g_ptr_array_index (parsed, row);
		int width = (int)fields->len;

		if (width > n_cols)
			n_cols = width;
		row++;
	}
	gnm_sheet_suggest_size (&n_cols, &n_rows);
	stf_parse_general_free (parsed);
	g_string_chunk_free (chunk);

	sheet_name = g_path_get_basename (input_name);
	sheet = sheet_new (book, sheet_name, n_cols, n_rows);
	g_free (sheet_name);
	workbook_sheet_attach (book, sheet);

	/* Second pass: parse straight into the sheet. */
	if (!stf_parse_sheet (po, text->str, NULL, sheet, 0, 0)) {
		workbook_sheet_delete (sheet);
		go_cmd_context_error_import (GO_CMD_CONTEXT (context),
			_("Parse error while trying to parse data into sheet"));
	} else {
		gboolean comma_separated;
		const char *saver_id;

		workbook_recalc_all (book);
		resize_columns (sheet);
		if (po->cols_exceeded || po->rows_exceeded) {
			stf_warning (context,
				     _("Some data did not fit on the "
				       "sheet and was dropped."));
		}

		/* Pick the saver according to the separator that was used. */
		comma_separated = po->sep.chr && po->sep.chr[0] == ',';
		if (comma_separated)
			saver_id = "Gnumeric_stf:stf_csv";
		else
			saver_id = "Gnumeric_stf:stf_assistant";

		workbook_set_saveinfo (book,
				       GO_FILE_FL_WRITE_ONLY,
				       go_file_saver_for_id (saver_id));
	}

	stf_parse_options_free (po);
	g_string_free (text, TRUE);
}