// The stdio-JSON lrec-reader is non-streaming: we ingest all records here in the start-of-file hook. // Then in the process method we pop one lrec off the list at a time, until they are all exhausted. // This is in contrast to other Miller lrec-readers. // // It would be possible to extend the streaming framework to also have an end-of-file hook // which we could use here to free parsed-JSON data. However, we simply leverage the start-of-file // hook for the *next* file (if any) or the free method (if not): these free parsed-JSON structures // from the previous file (if any). static void lrec_reader_stdio_json_sof(void* pvstate, void* pvhandle) { lrec_reader_stdio_json_state_t* pstate = pvstate; file_ingestor_stdio_state_t* phandle = pvhandle; json_char* json_input = (json_char*)phandle->sof; json_value_t* parsed_top_level_json; json_char error_buf[JSON_ERROR_MAX]; if (pstate->ptop_level_json_objects != NULL) { for (sllve_t* pe = pstate->ptop_level_json_objects->phead; pe != NULL; pe = pe->pnext) { json_value_t* top_level_json_object = pe->pvvalue; json_value_free(top_level_json_object); } sllv_free(pstate->ptop_level_json_objects); } if (pstate->precords != NULL) { for (sllve_t* pf = pstate->precords->phead; pf != NULL; pf = pf->pnext) { lrec_t* prec = pf->pvvalue; lrec_free(prec); } sllv_free(pstate->precords); } pstate->ptop_level_json_objects = sllv_alloc(); pstate->precords = sllv_alloc(); // This enables us to handle input of the form // // { "a" : 1 } // { "b" : 2 } // { "c" : 3 } // // in addition to // // [ // { "a" : 1 } // { "b" : 2 } // { "c" : 3 } // ] // // This is in line with what jq can handle. In this case, json_parse will return // once for each top-level item and will give us back a pointer to the start of // the rest of the input stream, so we can call json_parse on the rest until it is // all exhausted. json_char* item_start = json_input; int length = phandle->eof - phandle->sof; while (TRUE) { parsed_top_level_json = json_parse(item_start, length, error_buf, &item_start); if (parsed_top_level_json == NULL) { fprintf(stderr, "Unable to parse JSON data: %s\n", error_buf); exit(1); } // The lrecs have their string pointers pointing into the parsed-JSON objects (for // efficiency) so it's important we not free the latter until our free method. reference_json_objects_as_lrecs(pstate->precords, parsed_top_level_json, pstate->json_flatten_separator); if (item_start == NULL) break; if (*item_start == 0) break; length -= (item_start - json_input); json_input = item_start; } }
// The stdio-JSON lrec-reader is non-streaming: we ingest all records here in the start-of-file hook. // Then in the process method we pop one lrec off the list at a time, until they are all exhausted. // This is in contrast to other Miller lrec-readers. // // It would be possible to extend the streaming framework to also have an end-of-file hook // which we could use here to free parsed-JSON data. However, we simply leverage the start-of-file // hook for the *next* file (if any) or the free method (if not): these free parsed-JSON structures // from the previous file (if any). static void lrec_reader_stdio_json_sof(void* pvstate, void* pvhandle) { lrec_reader_stdio_json_state_t* pstate = pvstate; file_ingestor_stdio_state_t* phandle = pvhandle; json_char* json_input = (json_char*)phandle->sof; json_value_t* parsed_top_level_json; json_char error_buf[JSON_ERROR_MAX]; // This enables us to handle input of the form // // { "a" : 1 } // { "b" : 2 } // { "c" : 3 } // // in addition to // // [ // { "a" : 1 } // { "b" : 2 } // { "c" : 3 } // ] // // This is in line with what jq can handle. In this case, json_parse will return // once for each top-level item and will give us back a pointer to the start of // the rest of the input stream, so we can call json_parse on the rest until it is // all exhausted. json_char* item_start = json_input; int length = phandle->eof - phandle->sof; char* detected_line_term = NULL; if (pstate->do_auto_line_term) { // Find the first line-ending sequence (if any): LF or CRLF. for (char* p = phandle->sof; p < phandle->eof; p++) { if (p[0] == '\n') { if (p > phandle->sof && p[-1] == '\r') { detected_line_term = "\r\n"; } else { detected_line_term = "\n"; } break; } } } // Skip comments. For JSON, we ingest the entire blob, this is a matter of finding and iterating over lines. // Miller data comments must be at start of line. if (pstate->comment_handling != COMMENTS_ARE_DATA) { char* line_term = pstate->specified_line_term; if (pstate->do_auto_line_term && detected_line_term != NULL) line_term = detected_line_term; mlr_json_strip_comments(phandle->sof, phandle->eof, pstate->comment_handling, pstate->comment_string, line_term); } mlr_json_end_strip(phandle->sof, &phandle->eof); length = phandle->eof - phandle->sof; if (length > 0) { while (TRUE) { parsed_top_level_json = json_parse(item_start, length, error_buf, &item_start); if (parsed_top_level_json == NULL) { fprintf(stderr, "%s: Unable to parse JSON data: %s\n", MLR_GLOBALS.bargv0, error_buf); exit(1); } // The lrecs have their string pointers pointing into the parsed-JSON objects (for // efficiency) so it's important we not free the latter until our free method. if (!reference_json_objects_as_lrecs(pstate->precords, parsed_top_level_json, pstate->input_json_flatten_separator, pstate->json_array_ingest)) { fprintf(stderr, "%s: Unable to parse JSON data.\n", MLR_GLOBALS.bargv0); exit(1); } if (item_start == NULL) break; if (*item_start == 0) break; length -= (item_start - json_input); json_input = item_start; // json_parse goes up to the '\r' or '\n' (whichever is found first) on the first // parse, then keeps going from there on the next. E.g. in the CRLF case it // consumes the CR at the end of the first read and consumes the LF at the start // of the second, and so on. After the very last parse, we need to here consume // the final '\n' which is (by itself) a parse error. if (length == 1 && *(char*)json_input == '\n') { break; } } } if (detected_line_term != NULL) { pstate->detected_line_term = detected_line_term; } }