/*
 * Convert a document to HTML and append the result to outbuffer.
 *
 * Handling, in order:
 *   - filetype "htm"/"html": the document is appended as-is; when
 *     titlefromadd is non-empty a <title> element is emitted first.
 *   - filetype "hnxt": ntobr() is applied to document and the result is
 *     formatted through html_tempelate.
 *   - anything else is looked up in h_fileFilter. A filter whose format is
 *     "text" is wrapped in html_text_tempelate after stripTags(). Any other
 *     filter gets the document written to a temporary file, is run through
 *     run_filter_exeoc()/run_filter_perlplugin(), and its result is collected
 *     according to the filter's outputformat ("text", "html", "textfile",
 *     "htmlfile", "dir" or "diradd").
 *
 * Parameters:
 *   filetype       file type / extension; lower-cased IN PLACE.
 *   document       document bytes (dokument_size of them).
 *   outbuffer      buffer the generated HTML is appended to.
 *   titlefromadd   title supplied by the caller; may be "".
 *   subname, documenturi, lastmodified, acl_allow, acl_denied
 *                  only forwarded to the recursive call made for "dir"
 *                  output.
 *   metahash       handed to the filter runners (and to libextractor when
 *                  USE_LIBEXTRACTOR is defined); recursive calls pass NULL.
 *
 * Returns 1 on success and 0 on failure. The process is terminated
 * (exit/errx) if the temporary input file cannot be opened or the filter
 * type is unknown.
 */
int bbdocument_convert(char filetype[],char document[],const int dokument_size, buffer *outbuffer, const char titlefromadd[], char *subname, char *documenturi, unsigned int lastmodified, char *acl_allow, char *acl_denied, struct hashtable **metahash) {

	FILE *filconfp = NULL;
	char filconvertetfile_real[216] = "";
	char filconvertetfile_out_txt[216] = "";
	char filconvertetfile_out_html[216] = "";
	int exeocbuflen;
	int i;
	int namelen;
	int names_ok;
	/* NULL-initialised so the shared error path can always free() it. */
	char *documentfinishedbuftmp = NULL;
	char fileconverttemplate[1024];
	struct fileFilterFormat *fileFilter = NULL;
	struct fileFilterFormat *fileFilterOrginal;
	pid_t pid;

	printf("bbdocument_convert: dokument_size %i, title \"%s\",filetype \"%s\"\n",dokument_size,titlefromadd,filetype);

	/* Convert the file type to lower case. */
	for (i = 0; filetype[i] != '\0'; i++) {
		filetype[i] = btolower(filetype[i]);
	}

	/*
	 * An HTML document can be used directly. Slightly inefficient, since
	 * it costs a memory copy.
	 */
	if ((strcmp(filetype,"htm") == 0) || (strcmp(filetype,"html") == 0 )) {
		if (titlefromadd[0] != '\0') {
			/*
			 * Some documents get a title at add time without having one
			 * in the HTML itself (e.g. e-mail). The HTML parser only
			 * honours the first title, so adding it at the very top of
			 * the document is sufficient.
			 */
			bprintf(outbuffer, "<title>%s</title>\n", titlefromadd);
		}
		bmemcpy(outbuffer, document, dokument_size);
		return 1;
	}
	else if (strcmp(filetype,"hnxt") == 0) {
		ntobr(document, dokument_size);
		bprintf(outbuffer, html_tempelate, titlefromadd, document);
		return 1;
	}

	#ifdef DEBUG
	printf("strcmp done\n");
	#endif

	if (NULL == (fileFilterOrginal = hashtable_search(h_fileFilter,filetype) )) {
		printf("don't have converter for \"%s\"\n",filetype);

		#ifdef DEBUG
		printf("writing to unknownfiltype.log\n");
		FILE *fp;
		if ((fp = fopen(bfile("logs/unknownfiltype.log"),"ab")) == NULL) {
			perror(bfile("logs/unknownfiltype.log"));
		}
		else {
			printf("title %s\n",titlefromadd);
			printf("filetype %s\n",filetype);
			fprintf(fp,"%s: %s\n",titlefromadd,filetype);
			fclose(fp);
		}
		printf("writing to unknownfiltype.log. done\n");
		#endif

		return 0;
	}

	/* A file of format "text" does not need to be converted. */
	if (strcmp((*fileFilterOrginal).format,"text") == 0) {
		#ifdef DEBUG
		printf("fileFilter ses it is a file of format text. Can use it direktly\n");
		#endif

		char *cpbuf;
		int cpbufsize;

		/* Same allocation size as before: document plus 512 bytes of slack and a terminator. */
		cpbufsize = (dokument_size + 512 +1);
		if ((cpbuf = malloc(cpbufsize)) == NULL) {
			perror("malloc");
			return 0;
		}

		memcpy(cpbuf,document,dokument_size);
		cpbuf[dokument_size] = '\0';

		/* Strip '<' and '>' so the HTML parser does not mistake them for tags. */
		stripTags(cpbuf,dokument_size);

		#ifdef DEBUG
		printf("document %i\n",(int)strlen(document));
		#endif

		bprintf(outbuffer, html_text_tempelate, titlefromadd, cpbuf);

		free(cpbuf);

		return 1;
	}

	/* Work on a private copy of the filter info, since the command is modified below. */
	if ((fileFilter = malloc(sizeof *fileFilter)) == NULL) {
		perror("malloc");
		return 0;
	}
	memcpy(fileFilter, fileFilterOrginal, sizeof *fileFilter);

	#ifdef DEBUG
	printf("have converter for file type\n");
	#endif

	/*****************************************************************************
	 We have a converter. The document must be written to a file so that it
	 can be handed to the converter.
	*****************************************************************************/
	pid = getpid();

	/*
	 * The name buffers are much smaller than the template buffer, so every
	 * name is built with snprintf() and checked for truncation.
	 */
	namelen = snprintf(fileconverttemplate, sizeof(fileconverttemplate), "%s-%d", filconvertetfile, rand());
	names_ok = (namelen >= 0 && (size_t)namelen < sizeof(fileconverttemplate));

	namelen = snprintf(filconvertetfile_real, sizeof(filconvertetfile_real), "%s-%u.%s",fileconverttemplate, (unsigned int)pid,filetype);
	names_ok = names_ok && (namelen >= 0 && (size_t)namelen < sizeof(filconvertetfile_real));

	namelen = snprintf(filconvertetfile_out_txt, sizeof(filconvertetfile_out_txt), "%s-%u.txt",fileconverttemplate, (unsigned int)pid);
	names_ok = names_ok && (namelen >= 0 && (size_t)namelen < sizeof(filconvertetfile_out_txt));

	namelen = snprintf(filconvertetfile_out_html, sizeof(filconvertetfile_out_html), "%s-%u.html",fileconverttemplate, (unsigned int)pid);
	names_ok = names_ok && (namelen >= 0 && (size_t)namelen < sizeof(filconvertetfile_out_html));

	if (!names_ok) {
		/*
		 * Nothing has been created yet. Return directly rather than via the
		 * error label, which would unlink() the truncated names.
		 */
		fprintf(stderr, "bbdocument_convert: temporary file name too long for filetype \"%s\"\n", filetype);
		free(fileFilter);
		return 0;
	}

	#ifdef DEBUG
	printf("bbdocument_convert: filconvertetfile_real \"%s\"\n",filconvertetfile_real);
	#endif

	if ((filconfp = fopen(filconvertetfile_real,"wb")) == NULL) {
		perror(filconvertetfile_real);
		exit(1);
	}
	flock(fileno(filconfp),LOCK_EX);
	fwrite(document,1,dokument_size,filconfp);
	fclose(filconfp);

	/*
	 * Reopen the file read-only and take a shared lock on it, so that the
	 * perl /tmp watcher does not delete it while the filter runs.
	 */
	if ((filconfp = fopen(filconvertetfile_real,"rb")) == NULL) {
		perror(filconvertetfile_real);
		exit(1);
	}
	flock(fileno(filconfp),LOCK_SH);

	/* Convert to text. */
	/*****************************************************************************/
	strsandr((*fileFilter).command,"#file",filconvertetfile_real);
	strsandr((*fileFilter).command,"#outtxtfile",filconvertetfile_out_txt);
	strsandr((*fileFilter).command,"#outhtmlfile",filconvertetfile_out_html);

	exeocbuflen = (dokument_size * 2) + 513; // XXX: Find a better way
	if ((documentfinishedbuftmp = malloc(exeocbuflen)) == NULL) {
		perror("Can't malloc documentfinishedbuftmp");
		/* Go through the shared cleanup so the temp file, stream and filter copy are released. */
		goto bbdocument_convert_error;
	}
	/* Start as an empty string in case the filter only produces files and writes nothing back. */
	documentfinishedbuftmp[0] = '\0';

	switch (fileFilter->filtertype) {
		case FILTER_EXEOC:
			run_filter_exeoc(
				documentfinishedbuftmp,
				exeocbuflen,
				fileFilter,
				metahash
			);
			break;
		case FILTER_PERL_PLUGIN:
			run_filter_perlplugin(
				documentfinishedbuftmp,
				exeocbuflen ,
				fileFilter,
				metahash
			);
			break;
		default:
			errx(1, "Unknown filtertype '%d'", fileFilter->filtertype);
	}

#ifdef USE_LIBEXTRACTOR
	if (fileFilter->attrwhitelist != NULL)
		add_libextractor_attr(metahash, filconvertetfile_real, fileFilter->attrwhitelist);
#endif

	if (strcmp((*fileFilter).outputformat,"text") == 0) {
		/* Strip '<' and '>' so the HTML parser does not mistake them for tags. */
		stripTags(documentfinishedbuftmp,strlen(documentfinishedbuftmp));

		bprintf(outbuffer, html_text_tempelate,titlefromadd,documentfinishedbuftmp);
	}
	else if (strcmp((*fileFilter).outputformat,"html") == 0) {
		/*
		 * Output format (not file type) is HTML, so no conversion is needed;
		 * HTML as a file type is handled further up.
		 */
		bprintf(outbuffer, "%s", documentfinishedbuftmp);
		/*
		 * For converted files the title passed in at add time is the file
		 * name. It should only be used when the document has no title of its
		 * own, so it is appended at the bottom of the document.
		 */
		bprintf(outbuffer, "<title>%s</title>\n", titlefromadd);
	}
	else if (strcmp((*fileFilter).outputformat,"textfile") == 0) {
		FILE *fh;
		struct stat inode;
		char *cpbuf;
		size_t nread;

		printf("filconvertetfile_out_txt: \"%s\"\n",filconvertetfile_out_txt);

		if ((fh = fopen(filconvertetfile_out_txt,"rb")) == NULL) {
			printf("can't open out file \"%s\"\n",filconvertetfile_out_txt);
			perror(filconvertetfile_out_txt);
			goto bbdocument_convert_error;
		}
		if (fstat(fileno(fh),&inode) == -1) {
			perror("fstat");
			fclose(fh);
			goto bbdocument_convert_error;
		}

		if ((cpbuf = malloc(inode.st_size +1)) == NULL) {
			perror("malloc");
			fclose(fh);
			goto bbdocument_convert_error;
		}

		/* Terminate at what was actually read; a short read must not expose uninitialised bytes. */
		nread = fread(cpbuf,1,inode.st_size,fh);
		fclose(fh);
		cpbuf[nread] = '\0';

		printf("did read back %i bytes from file \"%s\"\n",(int)nread,filconvertetfile_out_txt);
		printf("strlen cpbuf: %i\n",(int)strlen(cpbuf));

		/* Strip '<' and '>' so the HTML parser does not mistake them for tags. */
		stripTags(cpbuf,nread);

		bprintf(outbuffer, html_text_tempelate, titlefromadd, cpbuf);
		free(cpbuf);

		/* Delete the output file the filter created. */
		unlink(filconvertetfile_out_txt);
	}
	else if (strcmp((*fileFilter).outputformat,"htmlfile") == 0) {
		FILE *fh;
		size_t n;
		char buf[4096];

		if ((fh = fopen(filconvertetfile_out_html,"rb")) == NULL) {
			printf("can't open out file \"%s\"\n",filconvertetfile_out_html);
			perror(filconvertetfile_out_html);
			goto bbdocument_convert_error;
		}

		while ((n = fread(buf, 1, sizeof(buf)-1, fh)) > 0) {
			bmemcpy(outbuffer, buf, n);
		}

		fclose(fh);
		unlink(filconvertetfile_out_html);
		bprintf(outbuffer, "<title>%s</title>\n", titlefromadd);
	}
	else if (strcmp(fileFilter->outputformat, "dir") == 0 || strcmp(fileFilter->outputformat, "diradd") == 0) {
		char *p, *pstart;
		int type; /* 1 for dir, 2 for diradd */

		type = (strcmp(fileFilter->outputformat, "dir") == 0) ? 1 : 2;

		/* Parse a private copy: the parser cuts it into fields in place. */
		p = strdup(documentfinishedbuftmp);
		pstart = p;
		if (p == NULL) {
			goto bbdocument_convert_error;
		}
		bprintf(outbuffer, html_text_tempelate, titlefromadd, "");

		/*
		 * Each line of filter output is "<filetype> <path>\n" for "dir" and
		 * "<filetype> <part> <path>\n" for "diradd". All scans are bounded
		 * by the terminating NUL so malformed output cannot run past the
		 * buffer.
		 */
		while (*p != '\0') {
			char *ft, *path, *sep;
			char *docbuf;
			int docbufsize;
			struct stat st;
			FILE *fp;

			ft = p;
			if ((sep = strchr(p, ' ')) == NULL) {
				fprintf(stderr, "bbdocument_convert: malformed filter output: \"%s\"\n", p);
				break;
			}
			*sep = '\0';
			p = sep + 1;

			if (type == 2) {
				/* The "part" field is parsed past but not used. */
				if ((sep = strchr(p, ' ')) == NULL) {
					fprintf(stderr, "bbdocument_convert: malformed filter output: \"%s\"\n", p);
					break;
				}
				*sep = '\0';
				p = sep + 1;
			}

			path = p;
			if ((sep = strchr(p, '\n')) != NULL) {
				*sep = '\0';
				p = sep + 1;
			} else {
				/* Last line without a trailing newline. */
				p += strlen(p);
			}

			/* We have a new file, let's get to work on it */
			if (stat(path, &st) == -1) {
				/* Unable to access file, move on to the next */
				fprintf(stderr, "File: %s\n", path);
				perror("stat");
				continue;
			}
			docbuf = malloc(st.st_size + 1); /* Make room for our lovely '\0' */
			if (docbuf == NULL) {
				perror("malloc");
				continue;
			}
			if ((fp = fopen(path, "r")) == NULL) {
				perror("fopen");
				free(docbuf);
				continue;
			}
			/* Use the number of bytes actually read as the document size. */
			docbufsize = (int)fread(docbuf, 1, (size_t)st.st_size, fp);
			fclose(fp);
			unlink(path);
			docbuf[docbufsize] = '\0';

			/*
			 * runarb, 18 Jan 2008: the title used to be "" here instead of
			 * titlefromadd, which made 24so crawling lose titles.
			 */
			if (bbdocument_convert(ft, docbuf, docbufsize, outbuffer, titlefromadd, subname, documenturi, lastmodified, acl_allow, acl_denied, NULL) == 0) {
				fprintf(stderr, "Failed on bbdocument_convert.\n");
				free(docbuf);
				continue;
			}

			free(docbuf);
		}

		if (type == 2) {
			/* "diradd" output is not supported; kept as a hard assertion. */
			assert(0);
		}
		free(pstart);
	}
	else {
		printf("unknown dokument outputformat \"%s\"\n",fileFilter->outputformat);
		goto bbdocument_convert_error;
	}

	free(documentfinishedbuftmp);

	/* Always remove the temporary files; otherwise they pile up. */
	unlink(filconvertetfile_real);
	unlink(filconvertetfile_out_txt);
	unlink(filconvertetfile_out_html);

	fclose(filconfp);

	free(fileFilter);

	return 1;

	bbdocument_convert_error:
		/* Shared cleanup for every failure after the filter copy was made. */
		if (filconvertetfile_real[0] != '\0') {
			unlink(filconvertetfile_real);
		}
		if (filconvertetfile_out_txt[0] != '\0') {
			unlink(filconvertetfile_out_txt);
		}
		if (filconvertetfile_out_html[0] != '\0') {
			unlink(filconvertetfile_out_html);
		}

		if (filconfp != NULL) {
			fclose(filconfp);
		}
		free(documentfinishedbuftmp);
		free(fileFilter);

		return 0;
}
/*
 * Ask the KOSHIAN keyword-extraction API
 * (details: http://blog.zuzara.com/2006/12/10/171/) for characteristic
 * words (at most 20) of a text and append the ones not already present
 * to tags[].
 *
 * Returns the function-static service status: it starts as TRUE and is set
 * to FALSE once a request comes back with an empty response. It is also
 * returned unchanged when the request buffer cannot be allocated.
 */
BOOL
getTagsFromKoshian(
        char   **tags,      /* (O)   tag candidates                          */
        int    *curNum,     /* (I/O) number of tag candidates collected      */
        int    numOfTags,   /* (I)   number of tag candidates wanted         */
        char   *response,   /* (O)   receive buffer                          */
        char   *response2,  /* (I)   text to analyse; run through any2utf()  */
                            /*       and stripTags() first unless striped    */
        size_t sz,          /* (I)   size of the receive buffer              */
        BOOL   striped      /* (I)   whether stripTags was already applied   */
    )
{
    static int  statZuzara = TRUE;
    int         num = *curNum;
    char        *p, *q;
    char        targetURL[MAX_URLLENGTH];
    char        *request;
    size_t      sz2 = sz * 16;

    request = (char *)malloc( sz2 );
    if ( request ) {
        if ( !striped ) {
            p = any2utf( response2 );
            q = stripTags(p ? p : response2);
            if ( strlen( q ) >= sz2 - 3 )
                q[sz2 - 3] = NUL;
        }
        else
            q = response2;

        /* URL-encoding can expand the text, so the write into request */
        /* has to be bounded by the buffer size.                       */
        snprintf( request, sz2, "q=%s", encodeURL( q ) );

        setUpReceiveBuffer( response, sz );
        strcpy( targetURL, "http://zuzara.dyndns.org/api/koshian" );
        http_post( targetURL,
                   "application/x-www-form-urlencoded",
                   request, response );

        if ( *response ) {
            BOOL    found;
            int     i;

            /* Walk every <Result ...>text</Result> element.              */
            /* The num check keeps tags[num] inside the requested count   */
            /* even if the caller arrives with the list already full.     */
            p = strstr( utf2sjisEx(response), "<Result " );
            while ( p && (num < numOfTags) ) {
                p += 8;
                q = strchr( p, '>' );
                if ( !q )
                    break;
                p = q + 1;
                q = strstr( p, "</Result>" );
                if ( !q )
                    break;

                /* NOTE(review): the capacity of tags[num] is not visible */
                /* here; assumes it can hold the element text - confirm.  */
                strncpy( tags[num], p, q - p );
                tags[num][q - p] = NUL;
                normalizeTag( tags[num] );

                /* Keep the candidate only if it is not a duplicate. */
                found = FALSE;
                for ( i = 0; i < num; i++ ) {
                    if ( !strcmp( tags[i], tags[num] ) ) {
                        found = TRUE;
                        break;
                    }
                }
                if ( found == FALSE ) {
                    num++;
                    if ( num >= numOfTags )
                        break;
                }

                q += 9;
                p = strstr( q, "<Result " );
            }

            /* Report the new count back; without this the tags found */
            /* above are invisible to the caller.                     */
            *curNum = num;
        }
        else
            statZuzara = FALSE;

        free( request );
    }

    return ( statZuzara );
}
/*
 * Collect tag candidates for a web page from several sources, stopping as
 * soon as *numOfTags candidates have been gathered:
 *   1. tags already obtained from Hatena Bookmark (hatenaTags),
 *   2. the 1470.net tag-candidate API (while *stat1470net is true),
 *   3. the page itself, fetched with http_get() and fed to the Yahoo! Japan
 *      morphological-analysis API, the Hatena keyword-link API, KOSHIAN and
 *      (when USE_BULKFEEDS_API is defined) Bulkfeeds.
 *
 * On return *numOfTags holds the number of candidates actually stored in
 * tags[], including when a working buffer could not be allocated.
 */
void
extractTags(
        const char *url,        /* (I)   URL of the web page                 */
        char       **tags,      /* (O)   tag candidates                      */
        char       **hatenaTags,/* (I/O) tag candidates from Hatena Bookmark */
        int        *numOfTags,  /* (I/O) number of tag candidates            */
        BOOL       *stat1470net,/* (I/O) availability of 1470.net            */
        const char *yahooAppID  /* (I)   Yahoo! application ID               */
    )
{
    char        *response;
    int         num = 0;
    size_t      sz  = MAX_CONTENT_SIZE * 2;
    static int  statZuzara = TRUE;  /* KOSHIAN availability, kept across calls */

    /* Take tags from the page's Hatena Bookmark entry first. */
    if ( hatenaTags && *hatenaTags ) {
        getTagsFromHatenaBookmark( tags, hatenaTags, &num, *numOfTags );
        if ( num >= *numOfTags ) {
            *numOfTags = num;
            return;
        }
    }

    response = (char *)malloc( sz );
    if ( !response ) {
        /* Still report how many candidates were collected so far, so */
        /* the caller does not treat unfilled slots as valid tags.    */
        *numOfTags = num;
        return;
    }

    /* Use the 1470.net tag-candidate extraction API. */
    if ( *stat1470net )
        *stat1470net = getTagsFrom1470net( url, tags, hatenaTags,
                                           &num, *numOfTags,
                                           response, sz );

    if ( num < *numOfTags ) {
        char    *response2;

        response2 = (char *)malloc( sz );
        if ( response2 ) {
            /* Fetch the page being analysed (the page being bookmarked). */
            setUpReceiveBuffer( response2, sz );
            http_get( url, response2 );

            /* Extract characteristic words with the various APIs. */
            if ( *response2 ) {
                size_t  sz2  = strlen(response2) * 16;
                char    *buf = (char *)malloc( sz2 );

                if ( buf ) {
                    /* buf receives the UTF-8 converted, tag-stripped text. */
                    char    *p = any2utf( response2 );
                    char    *q = stripTags(p ? p : response2);

                    if ( strlen( q ) >= sz2 - 3 )
                        q[sz2 - 3] = NUL;
                    strcpy( buf, q );

                    /* Yahoo! Japan morphological-analysis API */
                    getTagsFromYahooAPI( buf, tags, &num, *numOfTags,
                                         yahooAppID );
                }

                /* Hatena Diary keyword-link API */
                if ( num < *numOfTags )
                    getHatenaKeywordLink( buf ? buf : response2,
                                          tags, &num, *numOfTags,
                                          buf ? TRUE : FALSE );

                /* KOSHIAN */
                if ( statZuzara && (num < *numOfTags) )
                    statZuzara = getTagsFromKoshian(
                                          tags, &num, *numOfTags,
                                          response,
                                          buf ? buf : response2,
                                          sz,
                                          buf ? TRUE : FALSE );

#ifdef  USE_BULKFEEDS_API
                /* Bulkfeeds API: morphological analysis + keyword extraction */
                if ( num < *numOfTags )
                    getTagsFromBulkfeeds( buf ? buf : response2,
                                          tags, &num, *numOfTags,
                                          buf ? TRUE : FALSE );
#endif  /* USE_BULKFEEDS_API */

                free( buf );
            }

            free( response2 );
        }
    }

    *numOfTags = num;
    free( response );
}
/*
 * Run a text through setKeywordLink() and harvest the text of every
 * <a class="hatena" ...>...</a> element it produced as a tag candidate.
 * Candidates equal to one already in tags[] are not counted again.
 * *curNum is written back only when setKeywordLink() produced output.
 */
void
getHatenaKeywordLink(
        const char *body,       /* (I)   text to analyse                      */
        char       **tags,      /* (O)   tag candidates                       */
        int        *curNum,     /* (I/O) number of tag candidates collected   */
        int        numOfTags,   /* (I)   number of tag candidates wanted      */
        BOOL       striped      /* (I)   whether stripTags was already applied */
    )
{
    static const char   linkOpen[]  = "<a class=\"hatena\" ";
    static const char   linkClose[] = "</a>";
    const char          *text = body;
    char                *linked;
    char                *cursor;
    size_t              bufSize;
    int                 count;

    /* Work buffer: 32 times the input length (or the default size). */
    bufSize = ((body && *body) ? strlen( body ) : MAX_CONTENT_SIZE) * 32;
    linked  = (char *)malloc( bufSize );
    if ( !linked )
        return;

    if ( !striped ) {
        cursor = any2utf( body );
        text   = stripTags( cursor ? cursor : body );
    }

    /* Embed the keyword links. */
    memset( linked, 0x00, bufSize );
    cursor = setKeywordLink( text, linked, bufSize, 0, CNAME_GENERAL,
                             "_blank", "hatena" );

    if ( !cursor || !*cursor || !strcmp( cursor, "(null)" ) ) {
        /* The keyword-link API returned nothing usable. The format of    */
        /* http://d.hatena.ne.jp/images/keyword/keywordlist has changed   */
        /* (date unknown), so the changeHatenaKeyword() fallback no       */
        /* longer works; as a temporary measure produce no result.        */
        *linked = NUL;
    }
    else if ( !strncmp( cursor, ";lt=;lt=", 8 ) ) {
        /* Garbled characters detected. */
        free( linked );
        return;
    }
    else {
        utf2sjisEx( cursor );
    }

    if ( *linked ) {
        /* Pull the keywords out of the generated anchors. */
        char    *converted = any2sjis( linked );

        count  = *curNum;
        cursor = strstr( converted ? converted : linked, linkOpen );
        while ( cursor ) {
            char    *closeBracket;
            char    *textEnd;
            int     k;

            /* Skip the rest of the opening tag. */
            closeBracket = strchr( cursor + (sizeof linkOpen - 1), '>' );
            if ( !closeBracket )
                break;
            cursor = closeBracket + 1;

            textEnd = strstr( cursor, linkClose );
            if ( !textEnd )
                break;

            strncpy( tags[count], cursor, textEnd - cursor );
            tags[count][textEnd - cursor] = NUL;
            normalizeTag( tags[count] );

            /* Look for an identical candidate collected earlier. */
            for ( k = 0; k < count; k++ ) {
                if ( !strcmp( tags[k], tags[count] ) )
                    break;
            }
            if ( k == count ) {
                /* New candidate: keep it. */
                count++;
                if ( count >= numOfTags )
                    break;
            }

            cursor = strstr( textEnd + (sizeof linkClose - 1), linkOpen );
        }

        *curNum = count;
    }

    free( linked );
}