/**
 * Computes one feature row per character image and stores it in mDataMatrix.
 *
 * The matrix is resized to (number of chars) x (sum of vectorSize() over all
 * configured features). Row i is calculated from the preprocessed image of
 * imageChars[i]; afterwards that ImageChar is handed a pointer to the matrix
 * together with its row index via setFeatureData().
 *
 * @param imageChars characters to compute features for; every entry must be
 *                   non-null and carry a preprocessed image
 *                   (mPreprocessingResults.mpProcessedImage).
 * @throws NoDataException if no features are configured, or if any character
 *                         has no preprocessed image. In the latter case no row
 *                         has been computed yet when the exception is thrown.
 */
void CharFeatureCollection::calculateOutputDataMatrix(std::vector<ImageChar*> &imageChars)
{
	// check if there are features:
	if (mImageCharFeatureVec.empty()) {
		std::cout << "Exception in CharFeatureCollection::calculateOutputDataMatrix(std::vector<ImageChar> &imageChars): no feature vector list!" << std::endl;
		throw NoDataException("No features specified for feature vector calculation!");
	}

	// width of one row = summed length of all configured feature vectors:
	const int nFeatureExtractors = static_cast<int>(mImageCharFeatureVec.size());
	int nCols = 0;
	for (int f=0; f<nFeatureExtractors; ++f) {
		nCols += mImageCharFeatureVec[f]->vectorSize();
	}

	// resize output matrix:
	const int nRows = static_cast<int>(imageChars.size());
	std::cout << "2 matrix dimensions are: " << nRows << " x " << nCols << std::endl;
	mDataMatrix.resize(nRows, nCols);

	// Validate all inputs serially BEFORE entering the parallel loop: an
	// exception thrown inside an OpenMP region must be caught inside that same
	// region, so the check must not live in the loop body below.
	for (int i=0; i<nRows; ++i) {
		if (!imageChars[i] || !imageChars[i]->mPreprocessingResults.mpProcessedImage) {
			std::cerr << "Preprocessing image not available while computing features!" << std::endl;
			throw NoDataException("Preprocessing image not available while computing features!");
		}
	}

	// every iteration writes only its own matrix row and its own ImageChar:
	#pragma omp parallel for
	for (int i=0; i<nRows; ++i) {
		GrayImage<> *pPreprImage = imageChars[i]->mPreprocessingResults.mpProcessedImage;
		this->calculateOutputDataRow(*pPreprImage, i, mDataMatrix);
		std::cout << "2Computed feature " << i+1 << " of " << nRows << std::endl;
		// store reference to feature vector in ImageChar of mpImageCharsVec:
		imageChars[i]->setFeatureData(&mDataMatrix, i);
	} // end for all rows i

	return;
} // end calculateOutputDataMatrix
/**
 * Computes one feature row per image and stores it in mDataMatrix.
 *
 * The matrix is resized to (number of images) x (sum of vectorSize() over all
 * configured features); row i is calculated from *images[i].
 *
 * @param images images to compute features for; every pointer must be non-null.
 * @throws NoDataException if no features are configured or if any image
 *                         pointer is null (checked before any row is computed).
 */
void CharFeatureCollection::calculateOutputDataMatrix(const std::vector<GrayImage<> *> &images)
{
	// check if there are features:
	if (mImageCharFeatureVec.empty()) {
		std::cout << "Exception in CharFeatureCollection::calculateOutputDataMatrix(const std::vector<GrayImage<> > &images): no feature vector list!" << std::endl;
		throw NoDataException("No features specified for feature vector calculation!");
	}

	// width of one row = summed length of all configured feature vectors:
	const int nFeatureExtractors = static_cast<int>(mImageCharFeatureVec.size());
	int nCols = 0;
	for (int f=0; f<nFeatureExtractors; ++f) {
		nCols += mImageCharFeatureVec[f]->vectorSize();
	}

	// resize output matrix:
	const int nRows = static_cast<int>(images.size());
	std::cout << "Matrix dimensions are: " << nRows << " x " << nCols << std::endl;
	mDataMatrix.resize(nRows, nCols);

	// Reject null images up front. The check is done serially because an
	// exception must not propagate out of the OpenMP parallel region below.
	for (int i=0; i<nRows; ++i) {
		if (!images[i]) {
			std::cerr << "Image not available while computing features!" << std::endl;
			throw NoDataException("Image not available while computing features!");
		}
	}

	// every iteration writes only its own matrix row:
	#pragma omp parallel for
	for (int i=0; i<nRows; ++i) {
		this->calculateOutputDataRow(*images[i], i, mDataMatrix);
		std::cout << "1Computed feature " << i+1 << " of " << nRows << std::endl;
	} // end for all rows i

	return;
}
// Example #3
// 0
/**
 * Runs k-means (OpenCV cvKMeans2) on the feature matrix of the document and
 * stores the resulting labelling in the clustering result.
 *
 * Steps: set the distance type on the document, compute the features, run
 * cvKMeans2 on the feature matrix, build the clustering result from the label
 * vector, then clear preprocessing/feature data and update the prototypes.
 *
 * @param pParameters        clustering parameters; treated as KMeansParameters
 *                           (nClusters, maxIts, eps are read).
 * @param pDocument          document providing the feature matrix.
 * @param pClusteringResult  receives the clustering.
 * @throws Exception        if pParameters->dataType is DISTANCE_BASED.
 * @throws NoDataException  if the feature matrix is empty.
 */
void KMeans::startClustering(const ClusterMethodParameters* pParameters, Document *pDocument, ClusteringResult *pClusteringResult)
{
	mpDocument = pDocument;
	mpClusteringResult = pClusteringResult;

	this->mpDocument->setDistanceType(pParameters->dataType);
	if (pParameters->dataType == DISTANCE_BASED) {
		throw Exception("Distance based clustering not possible for KMeans!");
	}

	std::cout << "computing features..." << std::endl;
	this->mpDocument->computeFeatures();
	std::cout << "finished..." << std::endl;

	const KMeansParameters *pParams = static_cast<const KMeansParameters*>(pParameters);

	std::cout << "starting kmeans clustering..." << std::endl;
	std::cout << "nr of clusters: " << pParams->nClusters << std::endl;
	std::cout << "stopping parameters (max its, eps): " << pParams->maxIts << ", " << pParams->eps << std::endl;

	StopWatch watch;
	watch.start();
	ublas::matrix<float>& dataMatrix = this->mpDocument->getCharFeatureCollectionPointer()->dataMatrixRef();
	// the sizes are unsigned, so "no data" means that one of them is zero:
	if (dataMatrix.size1() == 0 || dataMatrix.size2() == 0) {
		throw NoDataException("No features found for clustering with kmeans!");
	}
	const int nSamples = static_cast<int>(dataMatrix.size1());

	CvMat *dataMatrixOCV = OpenCV::cvMatFromBoostMat<float>(dataMatrix);
	CvMat *clusters = cvCreateMat( nSamples, 1, CV_32SC1 );

	// Copy the labels out and release both OpenCV matrices on every path, so
	// that nothing leaks when k-means (or anything after it) throws.
	// NOTE(review): releasing dataMatrixOCV assumes cvMatFromBoostMat returns a
	// CvMat owned by the caller - confirm against the helper.
	std::vector<int> labels;
	try {
		cvKMeans2( dataMatrixOCV, pParams->nClusters, clusters,
				cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, pParams->maxIts, pParams->eps ) );
		std::cout << "finished k-means clustering using opencv!" << std::endl;
		watch.stop();

		// save cluster label vector (nSamples x 1 column of ints):
		labels.assign(clusters->data.i, clusters->data.i + nSamples);
	}
	catch (...) {
		cvReleaseMat( &clusters );
		cvReleaseMat( &dataMatrixOCV );
		throw;
	}
	cvReleaseMat( &clusters );
	cvReleaseMat( &dataMatrixOCV );

	mpClusteringResult->createClusteringResultFromLabelVector(labels, mpDocument);

	pDocument->clearAllPreprocessing();
	pDocument->clearFeatures();
	pClusteringResult->computePrototypeFeatures();
	pClusteringResult->updatePrototypes(true);

} // end startClustering
void CLARANSClustering::startClustering(const ClusterMethodParameters* pParameters, Document *pDocument, ClusteringResult *pClusteringResult)
{
	mpDocument = pDocument;
	mpClusteringResult = pClusteringResult;

	this->mpDocument->setDistanceType(pParameters->dataType);

	// retrieve parameters:
	const CLARANSParameters *pParams = (const CLARANSParameters*)(pParameters);

	std::cout << "CLARANS parameters: numClust = " << pParams->numClust << ", numLocal = " << pParams->numLocal << ", maxNeighbor = " << pParams->maxNeighbor << std::endl;

	mpClusteringResult->deleteClustering(); // delete probably old clustering result
	const int nSamples = this->mpDocument->nParsedImages();
	const int nClusts = pParams->numClust;
	if (nSamples < 2) {
		throw NoDataException("No data found for clustering!");
	}
	if (pParameters->dataType == FEATURES_BASED && this->mpDocument->nFeatures() < 2) {
		throw NoDataException("No features found for clustering!");
	}
	// get pointer to image chars:
	std::vector<ImageChar*> *imageCharVecPointer = this->mpDocument->getImageCharsVecPointer();

	// INITIALIZATION:
//	srand(time(NULL)); // FIXME!!!!!!!!!!!!!
	double minCost = 1e32f; minCost = 1e32f;
	double actCost = 0.0f; double newCost = 0.0f;
	std::vector<int> currentNode;
	getRandomNode(nSamples, nClusts, currentNode);
	actCost = getNodeCost(nSamples, currentNode);
	std::cout << "Starting node: "; printNode(currentNode);
	// LOOP:
	int j = 1; j=1;
	std::vector<int> neighborNode;
	getRandomNeighbor(nSamples, currentNode, neighborNode); // determine a random neighbor node
	newCost = getNodeCost(nSamples, neighborNode);

	std::cout << "Finished CLARANS clustering!" << std::endl;
	return;
} // end startClustering(...)
// Example #5
// 0
void terrama2::core::DataRetrieverFTP::retrieveDataCallback(const std::string& mask,
                                                            const Filter& filter,
                                                            const std::string& timezone,
                                                            std::shared_ptr<terrama2::core::FileRemover> remover,
                                                            const std::string& temporaryFolderUri,
                                                            const std::string& foldersMask,
                                                            std::function<void(const std::string &, const std::string &, const std::string&)> processFile) const
{
  try
  {
    // find valid directories
    std::vector< std::string > baseUriList;
    baseUriList.push_back(dataProvider_->uri);

    auto tz = timezone.empty() ? "UTC+00" : timezone;

    if(!foldersMask.empty())
    {
      auto uriList = getFoldersList(baseUriList, foldersMask, tz, filter);

      if(uriList.empty())
      {
        QString errMsg = QObject::tr("No files found!");
        TERRAMA2_LOG_WARNING() << errMsg;
        throw terrama2::core::NoDataException() << ErrorDescription(errMsg);
      }

      baseUriList = uriList;
    }

    // Build URI to get PATH fragment
    te::core::URI dataProviderURI(dataProvider_->uri);
    // Set temporary directory. When empty, creates a new pointing to dataProvider Path.
    // In this way, we will have something like "temporaryDir/dataProviderPath"
    // It is important due the folder may contains temporal mask
    std::string temporaryDataDir = temporaryFolderUri;
    if (temporaryFolderUri.empty())
      temporaryDataDir = getTemporaryFolder(remover, temporaryFolderUri) + dataProviderURI.path();

    // flag if there is any files for the dataset
    bool hasData = false;
    // Get a file listing from server
    for(const auto& uri : baseUriList)
    {
      std::vector<std::string> vectorFiles = curlwrapper_->listFiles(normalizeURI(uri));

      std::vector<std::string> vectorNames;
      // filter file names that should be downloaded.
      for(const std::string& fileName: vectorFiles)
      {
        // FIXME: use timestamp
        std::shared_ptr< te::dt::TimeInstantTZ > timestamp;
        if(terrama2::core::isValidDataSetName(mask,filter, tz, fileName,timestamp))
          vectorNames.push_back(fileName);
      }

      if(vectorNames.empty())
      {
        continue;
      }

      hasData = true;

      te::core::URI u(uri);
      std::string uriPath = QString::fromStdString(u.path()).replace(dataProviderURI.path().c_str(), "/").toStdString();

      // Performs the download of files in the vectorNames
      for(const auto& file: vectorNames)
      {
        // Create directory struct
        QString saveDir(QString::fromStdString(temporaryDataDir+ "/" + uriPath));
        QString savePath = QUrl(saveDir).toLocalFile();
        QDir dir(savePath);
        if(!dir.exists())
          dir.mkpath(savePath);

        std::string uriOrigin = uri + "/" + file;
        std::string filePath = savePath.toStdString() + "/" + file;

        remover->addTemporaryFolder(temporaryDataDir);
        remover->addTemporaryFile(filePath);

        try
        {
          curlwrapper_->downloadFile(normalizeURI(uriOrigin).uri(), filePath);
          TERRAMA2_LOG_WARNING() << QObject::tr("Finished downloading file: %1").arg(QString::fromStdString(file));
          processFile(temporaryDataDir, file, uriPath);
        }
        catch(const te::Exception& e)
        {
          QString errMsg = QObject::tr("Error during download of file %1.\n").arg(QString::fromStdString(file));
          auto errStr = boost::get_error_info<te::ErrorDescription>(e);
          if(errStr)
            errMsg.append(QString::fromStdString(*errStr));
          errMsg.append(e.what());

          TERRAMA2_LOG_ERROR() << errMsg;
          throw DataRetrieverException() << ErrorDescription(errMsg);
        }
      }
    }

    if(!hasData)
    {
      QString errMsg = QObject::tr("No data in the remote server.");
      TERRAMA2_LOG_WARNING() << errMsg;
      throw NoDataException() << ErrorDescription(errMsg);
    }
  }
  catch(const NoDataException&)
  {
    throw;
  }
  catch(const DataRetrieverException&)
  {
    throw;
  }
  catch(const te::Exception& e)
  {
    QString errMsg = QObject::tr("Error during download.\n");
    errMsg.append(boost::get_error_info<terrama2::ErrorDescription>(e));
    errMsg.append(e.what());

    TERRAMA2_LOG_ERROR() << errMsg;
    throw DataRetrieverException() << ErrorDescription(errMsg);
  }
  catch(const std::exception& e)
  {
    QString errMsg = QObject::tr("Error during download.\n");
    errMsg.append(e.what());

    TERRAMA2_LOG_ERROR() << errMsg;
    throw DataRetrieverException() << ErrorDescription(errMsg);
  }
  catch(...)
  {
    throw DataRetrieverException() << ErrorDescription(QObject::tr("Unknown Error."));
  }
}
/**
 * Runs agglomerative (hierarchical) clustering on the document using the
 * cluster library routines treecluster() / cuttree().
 *
 * Steps: compute features or preprocess glyphs (depending on the data type),
 * compute the distance matrix (initDistanceMatrix()), copy it into the
 * row-pointer layout expected by treecluster(), build the tree with the
 * requested linkage, cut it at pParams->nClusters and create the clustering
 * result from the label vector. Finally preprocessing and feature data are
 * cleared and the prototypes are updated.
 *
 * @param pParameters        clustering parameters; treated as
 *                           AgglomerativeParameters (nClusters,
 *                           featureDistType, clusterDistType are read).
 * @param pDocument          document holding the parsed images.
 * @param pClusteringResult  receives the clustering (old content is deleted).
 * @throws NoDataException if fewer than 2 parsed images exist, or if the data
 *                         type is FEATURES_BASED and fewer than 2 features exist.
 * @throws Exception       if the cluster distance type is unknown (raised
 *                         before the distance matrix is computed) or if
 *                         treecluster() returns NULL.
 */
void AgglomerativeClustering::startClustering(const ClusterMethodParameters* pParameters, Document *pDocument, ClusteringResult *pClusteringResult)
{
	mpDocument = pDocument;
	mpClusteringResult = pClusteringResult;
	// preprocess glyphs or compute features, depending on distance type:
	if (pParameters->dataType == FEATURES_BASED) {
		pDocument->computeFeatures();
	}
	else {
		pDocument->preprocessAllGlyphs();
	}

	// retrieve parameters:
	const AgglomerativeParameters *pParams = static_cast<const AgglomerativeParameters*>(pParameters);

	// print some debug info:
	std::cout << "Starting agglomerative clustering algorithm..." << std::endl;
	std::cout << "nr of clusters: " << pParams->nClusters << std::endl;
	std::cout << "feature dist type: " << pParams->featureDistType << std::endl;
	std::cout << "cluster dist type: " << pParams->clusterDistType << std::endl;

	mpClusteringResult->deleteClustering(); // delete probably old clustering result
	const int nSamples = this->mpDocument->nParsedImages();
	if (nSamples < 2) {
		throw NoDataException("No data found for clustering!");
	}
	if (pParameters->dataType == FEATURES_BASED && this->mpDocument->nFeatures() < 2) {
		throw NoDataException("No features found for clustering!");
	}

	// Resolve the linkage method first, so that an invalid setting is rejected
	// before the expensive distance matrix is computed.
	// method = 's' (single-linkage), 'm' (complete-linkage), 'a' (average-linkage) or 'c' (centroid-linkage):
	char methodChar='a';
	switch (pParams->clusterDistType) {
		case AVG_DIST:
			methodChar='a';
			break;
		case MIN_DIST:
			methodChar='s';
			break;
		case MAX_DIST:
			methodChar='m';
			break;
		default:
			throw Exception("Unknown distance type in AgglomerativeClustering::startClustering()");
	} // end switch

	StopWatch watch;
	// initialize distance matrix:
	std::cout << "computing distance matrix..." << std::endl;
	watch.start();
	initDistanceMatrix();
	watch.stop();
	std::cout << "successfully computed distance matrix" << std::endl;

	// Copy the distance matrix into a full square matrix, mirrored around the
	// diagonal. The storage lives in vectors so it is released on every exit
	// path; rowPointers provides the double** view passed to treecluster().
	std::vector<std::vector<double> > distRows(nSamples, std::vector<double>(nSamples));
	std::vector<double*> rowPointers(nSamples);
	for (int i=0; i<nSamples; ++i) {
		rowPointers[i] = &distRows[i][0];
	}
	for (int i=0; i<nSamples; ++i) {
		for (int j=0; j<=i; ++j) {
			const double dist = mDistMat(i,j);
			distRows[i][j] = dist;
			distRows[j][i] = dist;
		}
	}

	std::vector<int> labels(nSamples);
	Node* tree = treecluster(nSamples, 1, NULL, NULL, NULL, 0, '_', methodChar, &rowPointers[0]);
	if (tree==NULL) {
		// without a tree cuttree() must not be called:
		std::cerr << "FATAL ERROR - NULL POINTER IN CLUSTER RESULT!" << std::endl;
		throw Exception("Hierarchical clustering failed in AgglomerativeClustering::startClustering()");
	}

	// cut hierarchical cluster tree at specified nr of clusters:
	cuttree(nSamples, tree, pParams->nClusters, &labels[0]);
	// The C Clustering Library allocates the tree with malloc(), so it has to
	// be released with free() and not with delete[].
	// NOTE(review): std::free needs <cstdlib>; add the include if it is not
	// already pulled in by this file.
	std::free(tree);

	std::cout << "finished agglo clustering with cluster libarary!!" << std::endl;
	pClusteringResult->createClusteringResultFromLabelVector(labels, pDocument);

	pDocument->clearAllPreprocessing();
	pDocument->clearFeatures();
	pClusteringResult->computePrototypeFeatures();
	pClusteringResult->updatePrototypes(true);

	return;
} // end startClustering()
/**
 * Parses the segmentation results of all files of a document into a list of
 * character images.
 *
 * For each file the characters are parsed (parseSingleImageFile()), filtered
 * by the configured suspicious / non-suspicious selection, and a subset of
 * pars.subsetPerc percent of them is appended to imageCharVec. The subset is
 * picked randomly, from the start, or from the end of the page depending on
 * the generation type. The id of every appended character is set to the loop
 * index k used for the selection.
 *
 * @param pDocument    document providing parsing parameters, file names and
 *                     image pointers.
 * @param imageCharVec output; cleared first, then filled with the selected
 *                     characters of all files.
 * @throws Exception       if parsing method, subset type or generation type
 *                         is one of the UNKNOWN_* values.
 * @throws NoDataException if the document images are not loaded.
 */
void CharDataReaderI::parseFiles(Document *pDocument, std::vector<ImageChar> &imageCharVec) const
{
	imageCharVec.clear();

	ParsingParameters pars = pDocument->parsingParameters();
	const ParseSubsetType pType = pars.parsingType;
	const SubsetGenerationType genType = pars.generationType;
	const ParsingMethodType mType = pars.parsingMethod;

	if (mType == UNKNOWN_PARSING_METHOD) {
		throw Exception("An invalid parsing method type was given!");
	}
	if (pType == UNKNOWN_PARSESUBSET_TYPE) {
		throw Exception("An invalid parsing subset type was given!");
	}
	if (genType == UNKNOWN_GENERATION_TYPE) {
		throw Exception("An invalid parsing subset generation type was given!");
	}

	// the document images have to be opened by the caller beforehand:
	if (!pDocument->isDocumentImagesLoaded()) {
		throw NoDataException("No input documents opened for parsing!");
	}

	// file extension of the segmentation result files:
	std::string fileExtension = "";
	if (mType == FINEREADER_XML || mType == IM2CHARRECTS_XML) {
		fileExtension = "xml";
	}
	else if (mType == DAT_FILES) {
		fileExtension = "dat";
	}

	std::cout << "file extension = " << fileExtension << std::endl;
	pDocument->createSegmentationResultFileNames(fileExtension, pars.filenamePrefix,  pars.filenameSuffix);

	for (int i=0; i<pDocument->nFiles(); ++i) {
		std::cout << "Now parsing image file " << i << std::endl;

		// 1st: parse single image file, store result in tmp-vector:
		std::vector<ImageChar> parsedChars;
		this->parseSingleImageFile(pDocument->fileName(i), pDocument->segmentationResultFileName(i), pDocument->imagePointer(i), parsedChars, i);

		// 2nd: sort out susp. / non-susp. if needed:
		std::vector<ImageChar> candidates;
		const int nParsed = static_cast<int>(parsedChars.size());
		for (int j=0; j<nParsed; ++j) {
			const bool isSuspicious = parsedChars[j].suspicious;
			if ((pType == PARSE_ALL) || (pType == PARSE_ONLY_NON_SUSP && !isSuspicious) || (pType == PARSE_ONLY_SUSP && isSuspicious)) {
				candidates.push_back(parsedChars[j]);
			}
		} // end for all parsed chars j
		parsedChars.clear();

		// 3rd: generate subset of parsed chars according to generation type:
		const int nCandidates = static_cast<int>(candidates.size());
		const double percentage = double(pars.subsetPerc) / 100.0f;
		int nCharsToPars = static_cast<int>(ceil(nCandidates * percentage));
		// Clamp instead of only asserting: with NDEBUG an out-of-range
		// percentage (e.g. > 100) would otherwise index past the end of
		// the candidate vector below.
		if (nCharsToPars > nCandidates) {
			std::cout << "Warning: subset percentage " << pars.subsetPerc << " exceeds the available characters, using all of them" << std::endl;
			nCharsToPars = nCandidates;
		}
		if (nCharsToPars < 0) {
			nCharsToPars = 0;
		}
		std::cout << "Parsing " << pars.subsetPerc << "% of " << candidates.size() << " nr of chars = " << nCharsToPars << std::endl;

		if (genType == PARSE_RANDOMLY) { // take nCharsToPars nr of characters randomly from page
			std::vector<int> rand_tuple = RandomNumber::random_unique_integer_tuple(candidates.size());
			for (int k=0; k<nCharsToPars; k++) {
				imageCharVec.push_back(candidates[rand_tuple[k]]);
				imageCharVec.back().id = k;
			}
		}
		else if (genType == PARSE_FIRST) { // take nCharsToPars nr of characters first from page
			for (int k=0; k<nCharsToPars; k++) {
				imageCharVec.push_back(candidates[k]);
				imageCharVec.back().id = k;
			}
		}
		else if (genType == PARSE_LAST) { // take nCharsToPars nr of characters from the end of the page, last one first
			for (int l=0; l<nCharsToPars; l++) {
				const int k = nCandidates-l-1;
				imageCharVec.push_back(candidates[k]);
				imageCharVec.back().id = k;
			}
		}
	} // end for all files i

	return;
}