deck_t *new_deck() { deck_t *x = malloc(sizeof(deck_t)); x->header = NULL; x->slide = new_slide(); x->slides = x->headers = 0; return x; }
slide_t *next_slide(slide_t *prev) { slide_t *x = new_slide(); x->prev = prev; prev->next = x; return x; }
extern int process_pptx(char *pptx_file) { // Returns the number of slides, -1 in case of severe failure // (if the file doesn't exist, 0 is returned) mz_bool status; mz_zip_archive zip_archive; char *xml_slide; size_t xml_sz; char *p; char *q; char *s; char *t; int i; int deckid; int maxslide = 0; int notenum; int slidenum = 0; int slideid; char xmlrel[FILENAME_MAX]; short kw; short level; memset(&zip_archive, 0, sizeof(zip_archive)); status = mz_zip_reader_init_file(&zip_archive, pptx_file, 0); if (!status) { fprintf(stderr, "Cannot read %s!\n", pptx_file); } else { deckid = new_deck(pptx_file); // Get and print information about each file in the archive. for (i = 0; i < (int)mz_zip_reader_get_num_files(&zip_archive); i++) { mz_zip_archive_file_stat file_stat; if (!mz_zip_reader_file_stat(&zip_archive, i, &file_stat)) { fprintf(stderr, "mz_zip_reader_file_stat() failed!\n"); mz_zip_reader_end(&zip_archive); return -1; } if (!mz_zip_reader_is_file_a_directory(&zip_archive, i)) { if ((strncmp(file_stat.m_filename, "ppt/notesSlides/", 16) == 0) && strncmp(file_stat.m_filename, "ppt/notesSlides/_rels/", 22)) { // Notes // // *** ACHTUNG *** // Note numbers don't match the corresponding slide number, // that would be too easy. You have to dig into "_rels" // if (!sscanf(&(file_stat.m_filename[26]), "%d", ¬enum)) { fprintf(stderr, "Failed to extract note num from %s\n", &(file_stat.m_filename[16])); return -1; } sprintf(xmlrel, "ppt/notesSlides/_rels/notesSlide%d.xml.rels", notenum); if ((xml_slide = (char *)mz_zip_reader_extract_file_to_heap(&zip_archive, (const char *)xmlrel, &xml_sz, (mz_uint)0)) != (char *)NULL) { // Try to find the b****y slide number if ((p = strstr(xml_slide, "Target=\"../slides/slide")) != NULL) { if (!sscanf(&(p[23]), "%d", &slidenum)) { fprintf(stderr, "Failed to extract slidenum from %s\n", &(p[36])); return -1; } } free(xml_slide); } if (slidenum) { if (slidenum > maxslide) { maxslide = slidenum; } slideid = new_slide(deckid, slidenum); xml_sz = 0; if ((xml_slide = (char *)mz_zip_reader_extract_file_to_heap(&zip_archive, (const char *)file_stat.m_filename, &xml_sz, (mz_uint)0)) != (char *)NULL) { if (get_mode() & OPT_TAGS) { // Exclusively look for indexing words in the notes q = xml_slide; level = 0; while ((p = strchr(q, '[')) != (char *)NULL) { level++; do { p++; } while (isspace(*p)); q = p + 1; while (*q && ((*q != ']') || (level > 1))) { switch (*q) { case '[': level++; break; case ']': level--; break; default: break; } q++; } if (*q == ']') { *q = '\0'; if (get_sep() == ';') { // Replace HTML entities if there are any replace_entities(p); } while ((s = strchr(p, get_sep())) != (char *)NULL) { *s++ = '\0'; if (*p == get_kw()) { p++; kw = 1; } else { if (autokw()) { kw = lowercase_word(p); } else { kw = 0; } } if ((t = cleanup(p)) != (char *)NULL) { new_word_as_is(t, slideid, 'T', kw); free(t); } p = s; while (isspace(*p)) { p++; } } if (*p == get_kw()) { p++; kw = 1; } else { if (autokw()) { kw = lowercase_word(p); } else { kw = 0; } } if ((t = cleanup(p)) != (char *)NULL) { new_word_as_is(t, slideid, 'T', kw); free(t); } p = 1 + q; } else { break; } } } else { // Analyze text analyze_text(slideid, xml_slide, 'N'); } free(xml_slide); } else { fprintf(stderr, "Extract flopped\n"); return -1; } } } // We look at regular slides even if we are only interested // in tags to check that we aren't missing any slide without // notes and that our tag count is correct if ((strncmp(file_stat.m_filename, "ppt/slides/", 11) == 0) && strncmp(file_stat.m_filename, "ppt/slides/_rels/", 17)) { // Regular slide if (!sscanf(&(file_stat.m_filename[16]), "%d", &slidenum)) { fprintf(stderr, "Failed to extract num from %s\n", &(file_stat.m_filename[11])); return -1; } if (slidenum > maxslide) { maxslide = slidenum; } slideid = new_slide(deckid, slidenum); if (!(get_mode() & OPT_TAGS)) { xml_sz = 0; if ((xml_slide = (char *)mz_zip_reader_extract_file_to_heap(&zip_archive, (const char *)file_stat.m_filename, &xml_sz, (mz_uint)0)) != (char *)NULL) { // Analyze text analyze_text(slideid, xml_slide, 'S'); // Analyze images analyze_pic(slideid, xml_slide); free(xml_slide); } else { fprintf(stderr, "Extract flopped\n"); return -1; } } } } } // Close the archive, freeing any resources it was using mz_zip_reader_end(&zip_archive); } return maxslide; }