Пример #1
0
/*
 * Collect the documents reachable from `path` and add each of them to the
 * XML file opened at XML_PATH.
 *
 * path  - location to scan; unless OLD_ZRT is defined, a path whose
 *         extension is "zip" is always listed with get_file_list_inzip().
 * tMode - selects the listing strategy for non-zip paths:
 *           zip_obj  -> get_file_list_inzip() on OBJECT_DEVICE_NAME
 *           mail_obj -> get_file_list() on `path`, using the type filter
 *         any other value produces no XML output.
 *
 * Returns 0 in every case (including an unsupported tMode), as before.
 */
int docs_to_xml (char * path, Input_Obj_Type tMode )
{
	int i = 0;
	SingleList_t tFileList, *pFileList=&tFileList, tFileTypeFilter, *pFileTypeFilter = &tFileTypeFilter;
	int xml_fd;
	int bModeSupported = 1;
	char *ext = NULL;

	initList( pFileList );
	setFileTypeFilter ( pFileTypeFilter );

#ifndef OLD_ZRT
	/* getext_() may return NULL (do_extract_text() checks for it), so the
	 * result must not be handed to strcmp() unchecked. */
	ext = (char *) getext_( path );
	if ( ext != NULL && strcmp ( ext, "zip" ) == 0 )
	{
		get_file_list_inzip ( path , pFileList );
	}
	else
#endif
	if ( tMode == zip_obj )
	{
		get_file_list_inzip( OBJECT_DEVICE_NAME, pFileList );
	}
	else if ( tMode == mail_obj )
	{
		get_file_list( path, pFileList, pFileTypeFilter );
	}
	else
	{
		/* Unsupported mode: skip the XML step but still fall through to
		 * the cleanup below instead of returning with both lists live. */
		bModeSupported = 0;
	}

	/* do_extract_text() releases the getext_() result with free(), so it
	 * is released here as well; free(NULL) is a no-op. */
	free( ext );

	if ( bModeSupported )
	{
		/* NOTE(review): the result of open_xml_() is not validated here;
		 * its failure value is not visible from this file. */
		xml_fd = open_xml_( XML_PATH, tMode ); //FIXME const
		for ( i = 0; i < pFileList->count; i++)
		{
			add_doc_to_xml ( xml_fd, pFileList->list[i], tMode );
		}
		close_xml_( xml_fd );
	}

	freeList( pFileList );
	freeList( pFileTypeFilter );
	return 0;
}
Пример #2
0
/*
 * Dispatch text extraction for input_file according to its extension
 * (compared case-insensitively).
 *
 *   doc, rtf         -> catdoc_main() invoked with {"catdoc", input_file}
 *   txt, sh, html, c -> no conversion; output_file is shortened in place by
 *                       its last four characters (presumably a suffix such
 *                       as ".txt" appended by the caller -- confirm)
 *   docx, odt        -> docx_to_text()
 *   pdf              -> pdf_to_text()
 *
 * input_file  - path of the document to process.
 * output_file - path handed to the converters; modified in place for the
 *               pass-through types. Not used by the doc/rtf branch.
 *
 * Returns 0 when the extension is handled, 1 when getext_() returns NULL,
 * the extension is not supported, or output_file is missing / too short to
 * be shortened for a pass-through type.
 */
int do_extract_text (char *input_file, char *output_file)
{
	enum { OUT_SUFFIX_LEN = 4 };
	int rc = 0;
	char *ext = (char *) getext_( input_file );

	if (ext == NULL)
		return 1;

	if ( (strcasecmp( ext, "doc" ) == 0) || (strcasecmp( ext, "rtf" ) == 0) )
	{
		char *argv_catdoc [] = {"catdoc", input_file};
		int argc_catdoc = PCHARSIZE( argv_catdoc );
		catdoc_main( argc_catdoc, argv_catdoc );
	}
	else if (( strcasecmp( ext, "txt" ) == 0) || ( strcasecmp( ext, "sh" ) == 0) || ( strcasecmp( ext, "html" ) == 0) || ( strcasecmp( ext, "c" ) == 0) )
	{
		/* Guard the index: with fewer than OUT_SUFFIX_LEN characters the
		 * subtraction would wrap and write outside the buffer. */
		size_t out_len = ( output_file != NULL ) ? strlen ( output_file ) : 0;

		if ( out_len < OUT_SUFFIX_LEN )
			rc = 1;
		else
			output_file [ out_len - OUT_SUFFIX_LEN ] = '\0';
	}
	else if ( ( strcasecmp( ext, "docx" ) == 0) || ( strcasecmp( ext, "odt" ) == 0) )
	{
		/* NOTE(review): odt is routed through docx_to_text() exactly as in
		 * the original code -- confirm that this is intended. */
		docx_to_text( input_file, output_file );
	}
	else if ( strcasecmp( ext, "pdf" ) == 0)
	{
		pdf_to_text( input_file, output_file );
	}
	else
	{
		/* Extension not supported. */
		rc = 1;
	}

	/* Single release point for the extension string. */
	free (ext);
	return rc;
}
/*
 * Collect the documents reachable from `path`, print the resulting list to
 * stdout, and add each document to the XML file opened at XML_PATH.
 *
 * path     - location to scan. Unless OLD_ZRT is defined, a path whose
 *            extension is "zip" is listed with get_file_list_inzip();
 *            otherwise the directory tree is printed and listed with
 *            get_file_list() using the file-type filter.
 * doc_type - passed through unchanged to add_doc_to_xml().
 *
 * Returns 0 in every case.
 */
int docs_to_xml (char * path, int doc_type)
{
	int i = 0;
	SingleList_t tFileList, *pFileList=&tFileList, tFileTypeFilter, *pFileTypeFilter = &tFileTypeFilter;
	int xml_fd;
	char *ext = NULL;

	initList( pFileList );
	setFileTypeFilter ( pFileTypeFilter );

#ifndef OLD_ZRT
	/* getext_() may return NULL (do_extract_text() checks for it), so the
	 * result must not be handed to strcmp() unchecked. */
	ext = (char *) getext_( path );
	if ( ext != NULL && strcmp ( ext, "zip" ) == 0 )
	{
		get_file_list_inzip ( path , pFileList );
	}
	else
#endif
	{
		/* Braced so that the else covers the whole directory branch.
		 * Without braces only print_dir_tree() belonged to the else and
		 * get_file_list() also ran on a zip path that had already been
		 * listed above. */
		print_dir_tree (path);
		get_file_list ( path, pFileList, pFileTypeFilter );
	}

	/* do_extract_text() releases the getext_() result with free(), so it
	 * is released here as well; free(NULL) is a no-op. */
	free( ext );

	printf ( "list of indexed docs\n" );
	for ( i = 0; i < pFileList->count; i++)
	{
		printf ( "%s\n", pFileList->list[i] );
	}

	/* NOTE(review): the result of open_xml_() is not validated here; its
	 * failure value is not visible from this file. */
	xml_fd = open_xml_( XML_PATH ); //FIXME const
	for ( i = 0; i < pFileList->count; i++)
	{
		add_doc_to_xml ( xml_fd, pFileList->list[i], doc_type );
	}
	close_xml_( xml_fd );

	freeList( pFileList );
	freeList( pFileTypeFilter );
	return 0;
}