/*
 * reader:bytes_consumed()
 *
 * Lua binding around xmlTextReaderByteConsumed().
 * On success pushes the consumed byte count and returns 1 value.
 * When libxml2 reports -1, pushes nil followed by whatever
 * xmlreader_pusherror() pushes, and returns 2 values.
 */
static int xmlreader_bytes_consumed(lua_State *L)
{
    xmlreader reader = check_xmlreader(L, 1);
    long consumed = xmlTextReaderByteConsumed(reader);

    /* Handle the failure case first so the success path reads straight through. */
    if (consumed == -1) {
        lua_pushnil(L);
        xmlreader_pusherror(L);
        return 2;
    }

    lua_pushinteger(L, consumed);
    return 1;
}
/*
 * call-seq:
 *    reader.byte_consumed -> value
 *
 * This method provides the current index of the parser used by the reader,
 * relative to the start of the current entity.
 *
 * The value is returned by xmlTextReaderByteConsumed() as a C long, so it is
 * converted with LONG2NUM rather than INT2NUM: INT2NUM takes an int and would
 * truncate offsets that do not fit in one (e.g. documents larger than 2 GiB
 * on LP64 platforms).
 */
static VALUE rxml_reader_byte_consumed(VALUE self)
{
  return LONG2NUM(xmlTextReaderByteConsumed(rxml_text_reader_get(self)));
}
/*
 * call-seq:
 *    reader.byte_consumed -> value
 *
 * This method provides the current index of the parser used by the reader,
 * relative to the start of the current entity.
 *
 * The value is returned by xmlTextReaderByteConsumed() as a C long, so it is
 * converted with LONG2NUM rather than INT2NUM: INT2NUM takes an int and would
 * truncate offsets that do not fit in one (e.g. documents larger than 2 GiB
 * on LP64 platforms).
 */
static VALUE rxml_reader_byte_consumed(VALUE self)
{
  xmlTextReaderPtr xreader = rxml_text_reader_get(self);
  long consumed = xmlTextReaderByteConsumed(xreader);

  return LONG2NUM(consumed);
}
/**
 * Filters, transforms, and indexes a file using ngrams to the index.
 *
 * inFileName   - name of the XML file to process
 * wikiIndex    - the judy arrays to store the index of the wiki in
 * articleCount - in/out article counter: each article read is assigned the
 *                current value as its article number, then the counter is
 *                incremented
 *
 * When writeFiles is set, every article title is also written, one per line,
 * to "title_file" in the current directory. The process exits with status 1
 * if the title file or the XML input cannot be opened.
 */
void indexWiki(char* inFileName, Pvoid_t *wikiIndex, int* articleCount) {
    //-------------------- initialization --------------------//
    bool articleIndex[lastNgram] = {0}; // boolean array of what trigrams are in an article
    struct stemmer * currentStemmer = create_stemmer();

    // file for writing the titles to
    FILE* titleFile = NULL;
    if (writeFiles) {
        titleFile = fopen("title_file", "w");
        if (NULL == titleFile) {
            fprintf(stderr, "Error open title file: %m\n");
            exit(1);
        }
    }

    // initializes the libxml library
    LIBXML_TEST_VERSION

    // the reader for the document
    xmlTextReaderPtr wikiReader = xmlReaderForFile(inFileName, NULL,
        XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_HUGE);
    if (NULL == wikiReader) {
        fprintf(stderr, "Error opening XML wiki: %m\n");
        exit(1);
    }

    // for progress bar
    int percent = 0;
    long fileSize = getFileSize(inFileName);

    // initialization for currentArticle and its components
    article currentArticle;
    currentArticle.title = g_string_sized_new(256);
    currentArticle.body  = g_string_sized_new(786432); //768*1024

    //-------------------- index the wiki --------------------//
    optionalPrint ("%s", "Adding collection to index.\n");
    optionalPrint ("%d", (int)(fileSize / 1048576));
    optionalPrint (" MB in file\n");
    displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);

    // prime the loop: empty both buffers and advance the reader twice
    // before the first getArticle() call
    currentArticle.title->len = 0;
    currentArticle.body->len  = 0;
    xmlTextReaderRead(wikiReader); // at a <page> tag, drop in
    xmlTextReaderRead(wikiReader); // at a <page> tag, drop in

    // reads from xml file until file is finished, adds articles to index,
    // and writes titles to file; processes one article per iteration
    while (getArticle (wikiReader, &currentArticle)) {
        currentArticle.articleNumber = *articleCount;
        *articleCount = *articleCount + 1;

        // filter / transform text
        removeMarkup(currentArticle.body);
        stemText(currentArticle.body, currentStemmer); //ngramming.h

        // index the text
        indexText(currentArticle.body, articleIndex); //ngramming.h
        addToIndex(articleIndex, wikiIndex, currentArticle.articleNumber);

        // adds titles to title file
        if (writeFiles) {
            fprintf(titleFile, "%s\n", currentArticle.title->str);
        }

        // re-prime the loop
        currentArticle.title->len = 0;
        currentArticle.body->len  = 0;
        displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);
    }

    optionalPrint ("\n%s", "Finished indexing. \n");
    // the argument is a signed long, so the conversion must be %ld
    optionalPrint ("%ld", (long)(xmlTextReaderByteConsumed(wikiReader)/1048576));
    optionalPrint ("MB consumed\n");
    optionalPrint ("%d %s %d %s", *articleCount, "articles found",
                   (int) currentArticle.body->allocated_len,
                   "length allocated for article body\n");

    // clean up of structures needed to process wiki
    if (writeFiles) {
        fclose (titleFile);
    }
    free_stemmer(currentStemmer);
    xmlFreeTextReader(wikiReader);
    xmlCleanupParser();

    // NOTE(review): currentArticle.title and currentArticle.body are never
    // released. The original author reported a malloc failure when calling
    // g_string_free(..., TRUE) on them here; the root cause is not visible
    // in this function and should be investigated before freeing them.
}
/*
 * parse_service:
 * @reader: XML text reader positioned on the <service> element.
 * @service: service object to fill in.
 *
 * Walks the children of the <service> element and copies the recognised
 * ones into @service. Fields guarded by a NULL check (name, type,
 * display_name, provider) are only filled in when not already set.
 *
 * Parsing stops early at the first <type_data> element: its byte offset
 * inside service->file_data is stored in service->type_data_offset (when it
 * can be located) and the function returns immediately.
 *
 * Returns: TRUE on success (including the early <type_data> stop), FALSE if
 * a node has no name or one of the element parsers fails.
 */
static gboolean
parse_service (xmlTextReaderPtr reader, AgService *service)
{
    const gchar *name;
    int ret, type;

    if (!service->name)
    {
        xmlChar *id = xmlTextReaderGetAttribute (reader, (xmlChar *) "id");
        service->name = g_strdup ((const gchar *) id);
        if (id)
            xmlFree (id);
    }

    ret = xmlTextReaderRead (reader);
    while (ret == 1)
    {
        name = (const gchar *) xmlTextReaderConstName (reader);
        if (G_UNLIKELY (!name)) return FALSE;

        type = xmlTextReaderNodeType (reader);
        if (type == XML_READER_TYPE_END_ELEMENT &&
            strcmp (name, "service") == 0)
            break;

        if (type == XML_READER_TYPE_ELEMENT)
        {
            gboolean ok;

            if (strcmp (name, "type") == 0 && !service->type)
            {
                ok = _ag_xml_dup_element_data (reader, &service->type);
            }
            else if (strcmp (name, "name") == 0 && !service->display_name)
            {
                ok = _ag_xml_dup_element_data (reader, &service->display_name);
            }
            else if (strcmp (name, "description") == 0)
            {
                ok = _ag_xml_dup_element_data (reader, &service->description);
            }
            else if (strcmp (name, "provider") == 0 && !service->provider)
            {
                ok = _ag_xml_dup_element_data (reader, &service->provider);
            }
            else if (strcmp (name, "icon") == 0)
            {
                ok = _ag_xml_dup_element_data (reader, &service->icon_name);
            }
            else if (strcmp (name, "translations") == 0)
            {
                ok = _ag_xml_dup_element_data (reader, &service->i18n_domain);
            }
            else if (strcmp (name, "template") == 0)
            {
                ok = parse_template (reader, service);
            }
            else if (strcmp (name, "preview") == 0)
            {
                ok = parse_preview (reader, service);
            }
            else if (strcmp (name, "type_data") == 0)
            {
                static const gchar element[] = "<type_data";
                /* Compare only the visible characters of the tag prefix:
                 * sizeof() also counts the NUL terminator, and including it
                 * would require a '\0' right after "<type_data" in the file
                 * data, so the search could never succeed. */
                const gsize element_len = sizeof (element) - 1;
                /* xmlTextReaderByteConsumed() returns a long and reports
                 * failure as -1; keep it signed until validated so an error
                 * does not turn into a huge unsigned offset. */
                long consumed = xmlTextReaderByteConsumed (reader);
                gsize offset = (consumed > 0) ? (gsize) consumed : 0;

                /* find the offset in the file where this element begins,
                 * scanning backwards from the reader's current position */
                while (offset > 0)
                {
                    if (strncmp (service->file_data + offset, element,
                                 element_len) == 0)
                    {
                        service->type_data_offset = offset;
                        break;
                    }
                    offset--;
                }

                /* this element is placed after all the elements we are
                 * interested in: we can stop the parsing now */
                return TRUE;
            }
            else if (strcmp (name, "tags") == 0)
            {
                ok = _ag_xml_parse_element_list (reader, "tag",
                                                 &service->tags);
            }
            else
                ok = TRUE;

            if (G_UNLIKELY (!ok)) return FALSE;
        }

        ret = xmlTextReaderNext (reader);
    }
    return TRUE;
}