/*
 * Convert a document to HTML and append the result to outbuffer.
 *
 * filetype       File type / extension; lower-cased IN PLACE via btolower().
 * document       Document bytes. For file type "hnxt" it is handed to ntobr()
 *                and then used as a "%s" argument, so it is presumably expected
 *                to be NUL-terminated in that case -- TODO confirm with callers.
 * dokument_size  Number of bytes in document.
 * outbuffer      Receives the generated HTML (via bprintf()/bmemcpy()).
 * titlefromadd   Title supplied by the caller; may be the empty string.
 * subname, documenturi, lastmodified, acl_allow, acl_denied
 *                Not used here except that they are forwarded unchanged to the
 *                recursive call made for "dir"/"diradd" filters.
 * metahash       Forwarded to the filter runners (and to
 *                add_libextractor_attr() when USE_LIBEXTRACTOR is defined).
 *
 * Returns 1 on success and 0 on failure (no converter, allocation failure,
 * unreadable filter output, unknown output format, ...).
 *
 * NOTE: the process is terminated (exit()/errx()) if the temporary input file
 * cannot be opened or if the filter has an unknown filter type. That behavior
 * is kept from the original implementation.
 */
int bbdocument_convert(char filetype[],char document[],const int dokument_size, buffer *outbuffer, const char titlefromadd[], char *subname, char *documenturi, unsigned int lastmodified, char *acl_allow, char *acl_denied, struct hashtable **metahash) {

	FILE *filconfp = NULL;
	char filconvertetfile_real[216] = "";
	char filconvertetfile_out_txt[216] = "";
	char filconvertetfile_out_html[216] = "";
	char fileconverttemplate[1024];
	char *documentfinishedbuftmp = NULL;
	char *c;
	int exeocbuflen;
	int n;
	int nametoolong;
	int closeret;
	int ret = 0;
	size_t written;
	pid_t pid;
	struct fileFilterFormat *fileFilter = NULL;
	struct fileFilterFormat *fileFilterOrginal;

	printf("bbdocument_convert: dokument_size %i, title \"%s\",filetype \"%s\"\n",dokument_size,titlefromadd,filetype);

	// Convert the file type to lower case (in place).
	for (c = filetype; *c != '\0'; c++) {
		*c = btolower(*c);
	}

	// An HTML document can be used directly. Slightly inefficient, since the
	// data is copied once more.
	if ((strcmp(filetype,"htm") == 0) || (strcmp(filetype,"html") == 0 )) {
		if (titlefromadd[0] != '\0') {
			// Some documents get a title at add time without having one in
			// the HTML itself (e.g. e-mail). The HTML parser only honors the
			// first title, so putting it at the top of the document is enough.
			bprintf(outbuffer, "<title>%s</title>\n", titlefromadd);
		}
		bmemcpy(outbuffer, document, dokument_size);
		return 1;
	}
	else if (strcmp(filetype,"hnxt") == 0) {
		ntobr(document, dokument_size);
		bprintf(outbuffer, html_tempelate, titlefromadd, document);
		return 1;
	}

	#ifdef DEBUG
	printf("strcmp done\n");
	#endif

	if (NULL == (fileFilterOrginal = hashtable_search(h_fileFilter,filetype) )) {
		printf("don't have converter for \"%s\"\n",filetype);

		#ifdef DEBUG
		{
			FILE *fp;

			printf("writing to unknownfiltype.log\n");
			if ((fp = fopen(bfile("logs/unknownfiltype.log"),"ab")) == NULL) {
				perror(bfile("logs/unknownfiltype.log"));
			}
			else {
				printf("title %s\n",titlefromadd);
				printf("filetype %s\n",filetype);
				fprintf(fp,"%s: %s\n",titlefromadd,filetype);
				fclose(fp);
			}
			printf("writing to unknownfiltype.log. done\n");
		}
		#endif

		return 0;
	}

	// A file whose filter format is "text" does not need a converter.
	if (strcmp(fileFilterOrginal->format,"text") == 0) {
		char *cpbuf;
		int cpbufsize;

		#ifdef DEBUG
			printf("fileFilter ses it is a file of format text. Can use it direktly\n");
		#endif

		// Work on a NUL-terminated copy so the caller's buffer is untouched.
		// The 512 bytes of slack are kept from the original code; presumably
		// headroom for stripTags() -- TODO confirm.
		cpbufsize = (dokument_size + 512 +1);
		if ((cpbuf = malloc(cpbufsize)) == NULL) {
			perror("malloc");
			return 0;
		}

		memcpy(cpbuf,document,dokument_size);
		cpbuf[dokument_size] = '\0';

		// Strip < and > characters, since the HTML parser would otherwise
		// take them for HTML tags.
		stripTags(cpbuf,dokument_size);

		#ifdef DEBUG
		printf("document %i\n",(int)strlen(document));
		#endif

		bprintf(outbuffer, html_text_tempelate, titlefromadd, cpbuf);
		free(cpbuf);

		return 1;
	}

	// We need a private copy of the filter info, because the command
	// template in it is modified below.
	if ((fileFilter = malloc(sizeof *fileFilter)) == NULL) {
		perror("malloc");
		return 0;
	}
	memcpy(fileFilter, fileFilterOrginal, sizeof *fileFilter);

	#ifdef DEBUG
		printf("have converter for file type\n");
	#endif


	/*****************************************************************************
		We have a converter. The document must be written to a file so
		it can be handed to the converter.
	*****************************************************************************/

	pid = getpid();

	n = snprintf(fileconverttemplate, sizeof(fileconverttemplate), "%s-%d", filconvertetfile, rand());
	if (n < 0 || (size_t)n >= sizeof(fileconverttemplate)) {
		fprintf(stderr, "bbdocument_convert: temporary file name is too long\n");
		goto bbdocument_convert_done;
	}

	n = snprintf(filconvertetfile_real, sizeof(filconvertetfile_real), "%s-%u.%s", fileconverttemplate, (unsigned int)pid, filetype);
	nametoolong = (n < 0 || (size_t)n >= sizeof(filconvertetfile_real));
	n = snprintf(filconvertetfile_out_txt, sizeof(filconvertetfile_out_txt), "%s-%u.txt", fileconverttemplate, (unsigned int)pid);
	nametoolong = nametoolong || (n < 0 || (size_t)n >= sizeof(filconvertetfile_out_txt));
	n = snprintf(filconvertetfile_out_html, sizeof(filconvertetfile_out_html), "%s-%u.html", fileconverttemplate, (unsigned int)pid);
	nametoolong = nametoolong || (n < 0 || (size_t)n >= sizeof(filconvertetfile_out_html));

	if (nametoolong) {
		// Clear the names so the cleanup code does not unlink a truncated,
		// and therefore unrelated, path.
		filconvertetfile_real[0] = '\0';
		filconvertetfile_out_txt[0] = '\0';
		filconvertetfile_out_html[0] = '\0';
		fprintf(stderr, "bbdocument_convert: temporary file name is too long\n");
		goto bbdocument_convert_done;
	}

	#ifdef DEBUG
	printf("bbdocument_convert: filconvertetfile_real \"%s\"\n",filconvertetfile_real);
	#endif
	if ((filconfp = fopen(filconvertetfile_real,"wb")) == NULL) {
		perror(filconvertetfile_real);
		exit(1);
	}
	flock(fileno(filconfp),LOCK_EX);
	written = fwrite(document,1,dokument_size,filconfp);
	closeret = fclose(filconfp);
	filconfp = NULL;
	if (written != (size_t)dokument_size || closeret != 0) {
		// Do not run the converter on a partially written input file.
		fprintf(stderr, "bbdocument_convert: can't write document to \"%s\"\n", filconvertetfile_real);
		goto bbdocument_convert_done;
	}

	// Reopen the file read-only and hold a shared lock on it, so that the
	// /tmp watcher does not delete it while the converter runs.
	if ((filconfp = fopen(filconvertetfile_real,"rb")) == NULL) {
		perror(filconvertetfile_real);
		exit(1);
	}
	flock(fileno(filconfp),LOCK_SH);

	/*****************************************************************************
		Run the converter.
	*****************************************************************************/

	// Fill in the placeholders of the filter's command template.
	strsandr(fileFilter->command,"#file",filconvertetfile_real);
	strsandr(fileFilter->command,"#outtxtfile",filconvertetfile_out_txt);
	strsandr(fileFilter->command,"#outhtmlfile",filconvertetfile_out_html);

	exeocbuflen = (dokument_size * 2) + 513; // XXX: Find a better way
	if ((documentfinishedbuftmp = malloc(exeocbuflen)) == NULL) {
		perror("Can't malloc documentfinishedbuftmp");
		goto bbdocument_convert_done;
	}

	switch (fileFilter->filtertype) {
		case FILTER_EXEOC:
			run_filter_exeoc(
				documentfinishedbuftmp,
				exeocbuflen,
				fileFilter,
				metahash
			);
			break;
		case FILTER_PERL_PLUGIN:
			run_filter_perlplugin(
				documentfinishedbuftmp,
				exeocbuflen,
				fileFilter,
				metahash
			);
			break;
		default:
			errx(1, "Unknown filtertype '%d'", fileFilter->filtertype);
	}

#ifdef USE_LIBEXTRACTOR
	if (fileFilter->attrwhitelist != NULL)
		add_libextractor_attr(metahash, filconvertetfile_real, fileFilter->attrwhitelist);
#endif

	/*****************************************************************************
		Turn the converter output into HTML, depending on the output
		format the filter declares.
	*****************************************************************************/

	if (strcmp(fileFilter->outputformat,"text") == 0) {
		// Strip < and > characters, since the HTML parser would otherwise
		// take them for HTML tags.
		stripTags(documentfinishedbuftmp,strlen(documentfinishedbuftmp));

		bprintf(outbuffer, html_text_tempelate,titlefromadd,documentfinishedbuftmp);
	}
	else if (strcmp(fileFilter->outputformat,"html") == 0) {
		// HTML output needs no conversion. Note that this is about the
		// filter's *output format* being html; documents whose *file type*
		// is html are handled at the top of the function.
		bprintf(outbuffer, "%s", documentfinishedbuftmp);
		// For converted files the supplied title (from add) is the file
		// name. It should only be used when the document has no title of its
		// own, so it is appended at the bottom of the document.
		bprintf(outbuffer, "<title>%s</title>\n", titlefromadd);
	}
	else if (strcmp(fileFilter->outputformat,"textfile") == 0) {
		FILE *fh;
		struct stat inode;
		char *cpbuf;
		size_t nread;

		printf("filconvertetfile_out_txt: \"%s\"\n",filconvertetfile_out_txt);

		if ((fh = fopen(filconvertetfile_out_txt,"rb")) == NULL) {
			printf("can't open out file \"%s\"\n",filconvertetfile_out_txt);
			perror(filconvertetfile_out_txt);
			goto bbdocument_convert_done;
		}
		if (fstat(fileno(fh),&inode) == -1) {
			perror(filconvertetfile_out_txt);
			fclose(fh);
			goto bbdocument_convert_done;
		}
		if ((cpbuf = malloc((size_t)inode.st_size + 1)) == NULL) {
			perror("malloc");
			fclose(fh);
			goto bbdocument_convert_done;
		}

		// Terminate after the bytes actually read, so a short read never
		// exposes uninitialized memory.
		nread = fread(cpbuf,1,(size_t)inode.st_size,fh);
		fclose(fh);
		cpbuf[nread] = '\0';

		printf("did read back %i bytes from file \"%s\"\n",(int)nread,filconvertetfile_out_txt);

		printf("strlen cpbuf: %i\n",(int)strlen(cpbuf));

		// Strip < and > characters, since the HTML parser would otherwise
		// take them for HTML tags.
		stripTags(cpbuf,nread);

		bprintf(outbuffer, html_text_tempelate, titlefromadd, cpbuf);
		free(cpbuf);

		// Delete the file the converter created.
		unlink(filconvertetfile_out_txt);
	}
	else if (strcmp(fileFilter->outputformat,"htmlfile") == 0) {
		FILE *fh;
		size_t nread;
		char buf[4096];

		if ((fh = fopen(filconvertetfile_out_html,"rb")) == NULL) {
			printf("can't open out file \"%s\"\n",filconvertetfile_out_html);
			perror(filconvertetfile_out_html);
			goto bbdocument_convert_done;
		}

		while ((nread = fread(buf, 1, sizeof(buf)-1, fh)) > 0) {
			bmemcpy(outbuffer, buf, nread);
		}

		fclose(fh);
		unlink(filconvertetfile_out_html);
		bprintf(outbuffer, "<title>%s</title>\n", titlefromadd);
	}
	else if (strcmp(fileFilter->outputformat, "dir") == 0 || strcmp(fileFilter->outputformat, "diradd") == 0) {
		/*
		 * The filter printed a listing of extracted files, one per line:
		 *   "dir":    <filetype> <path>\n
		 *   "diradd": <filetype> <part> <path>\n
		 * Each listed file is read, deleted and converted recursively into
		 * the same outbuffer.
		 */
		char *p, *pstart;
		int type; /* 1 for dir, 2 for diradd */

		type = (strcmp(fileFilter->outputformat, "dir") == 0) ? 1 : 2;

		pstart = p = strdup(documentfinishedbuftmp);
		if (p == NULL) {
			goto bbdocument_convert_done;
		}
		bprintf(outbuffer, html_text_tempelate, titlefromadd, "");
		while (*p != '\0') {
			char *ft, *path;
			char *docbuf;
			int docbufsize;
			struct stat st;
			FILE *fp;

			/* File type field. */
			ft = p;
			while (*p != '\0' && *p != ' ')
				p++;
			if (*p == '\0') {
				/* Malformed line: never scan past the end of the listing. */
				fprintf(stderr, "bbdocument_convert: malformed line in filter output\n");
				break;
			}
			*p++ = '\0';

			if (type == 2) {
				/* "part" field; it is parsed but not used. */
				while (*p != '\0' && *p != ' ')
					p++;
				if (*p == '\0') {
					fprintf(stderr, "bbdocument_convert: malformed line in filter output\n");
					break;
				}
				*p++ = '\0';
			}

			/* Path field runs to end of line (or end of listing). */
			path = p;
			while (*p != '\0' && *p != '\n')
				p++;
			if (*p == '\n')
				*p++ = '\0';

			/* We have a new file, let's get to work on it */
			if (stat(path, &st) == -1) { /* Unable to access file, move on to the next */
				fprintf(stderr, "File: %s\n", path);
				perror("stat");
				continue;
			}

			docbuf = malloc((size_t)st.st_size + 1); /* Make room for the terminating '\0' */
			if (docbuf == NULL) {
				perror("malloc");
				continue;
			}
			if ((fp = fopen(path, "r")) == NULL) {
				perror("fopen");
				free(docbuf);
				continue;
			}
			/* Use the number of bytes actually read as the document size. */
			docbufsize = (int)fread(docbuf, 1, (size_t)st.st_size, fp);
			fclose(fp);
			unlink(path);
			docbuf[docbufsize] = '\0';

			// runarb, 18 Jan 2008: the title passed here used to be "", not
			// titlefromadd, which made 24so crawling lose titles.
			if (bbdocument_convert(ft, docbuf, docbufsize, outbuffer, titlefromadd, subname, documenturi, lastmodified, acl_allow, acl_denied,  NULL) == 0) {
				fprintf(stderr, "Failed on bbdocument_convert.\n");
			}

			free(docbuf);
		}
		if (type == 2) {
			/* "diradd" is deliberately trapped here, as in the original. */
			assert(0);
		}
		free(pstart);
	}
	else {
		printf("unknown dokument outputformat \"%s\"\n",fileFilter->outputformat);
		goto bbdocument_convert_done;
	}

	ret = 1;

	/* Single exit path: releases everything acquired above, on success and on error. */
	bbdocument_convert_done:
		free(documentfinishedbuftmp);

		if (filconvertetfile_real[0] != '\0') {
			unlink(filconvertetfile_real);
		}
		if (filconvertetfile_out_txt[0] != '\0') {
			unlink(filconvertetfile_out_txt);
		}
		if (filconvertetfile_out_html[0] != '\0') {
			unlink(filconvertetfile_out_html);
		}

		if (filconfp != NULL) {
			fclose(filconfp);
		}

		free(fileFilter);

		return ret;
}
/* Example #2 (0) -- separator left over from the code listing; not part of the program. */
/*
 * Normalize an ACL element in place: acl is passed through strsandr()
 * twice, asking it to substitute "_" first for " " and then for "-".
 */
void aclElementNormalize (char acl[]) {
	char *separators[] = { " ", "-" };
	int idx;

	for (idx = 0; idx < 2; idx++) {
		strsandr(acl, separators[idx], "_");
	}
}