void CharFeatureCollection::calculateOutputDataMatrix(std::vector<ImageChar*> &imageChars) { // check if there are features: if (mImageCharFeatureVec.empty()) { std::cout << "Exception in CharFeatureCollection::calculateOutputDataMatrix(std::vector<ImageChar> &imageChars): no feature vector list!" << std::endl; throw NoDataException("No features specified for feature vector calculation!"); } // calc. size of one column: int nCols = 0; for (int i=0; i<mImageCharFeatureVec.size(); ++i) { nCols += mImageCharFeatureVec[i]->vectorSize(); } // resize output matrix: const int nRows = imageChars.size(); std::cout << "2 matrix dimensions are: " << nRows << " x " << nCols << std::endl; mDataMatrix.resize(nRows, nCols); #if 1 // take preprocessed image from pointer in ImageChar #pragma omp parallel for for (int i=0; i<nRows; ++i) { #if 0 if (imageChars[i]->mPreprocessingResults.isEmpty()) { imageChars[i]->mPreprocessingResults } #else // GrayImage<> *pPreprImage = imageChars[i]->pPreprImage; GrayImage<> *pPreprImage = imageChars[i]->mPreprocessingResults.mpProcessedImage; if (!pPreprImage) { std::cerr << "Preprocessing image not available while computing features!" 
<< std::endl; throw NoDataException("Preprocessing image not available while computing features!"); } #endif this->calculateOutputDataRow(*pPreprImage, i, mDataMatrix); std::cout << "2Computed feature " << i+1 << " of " << nRows << std::endl; // store reference to feature vector in ImageChar of mpImageCharsVec: imageChars[i]->setFeatureData(&mDataMatrix, i); } // end for all rows i #else // extract bounding box image from ImageChar for (int i=0; i<nRows; ++i) { GrayImage<> *pImage = imageChars[i]->pImage; BoundingBox bbox = imageChars[i]->bBox; pImage->setRoi(bbox); GrayImage<> charImage; pImage->computeRoiImage(charImage); pImage->releaseRoi(); // std::cout << "current row: " << i << std::endl; this->calculateOutputDataRow(charImage, i, mDataMatrix); // store reference to feature vector in ImageChar of mpImageCharsVec: imageChars[i]->setFeatureData(&mDataMatrix, i); } // end for all rows i #endif return; } // end calculateOutputDataMatrix
void CharFeatureCollection::calculateOutputDataMatrix(const std::vector<GrayImage<> *> &images) { // check if there are features: if (mImageCharFeatureVec.empty()) { std::cout << "Exception in CharFeatureCollection::calculateOutputDataMatrix(const std::vector<GrayImage<> > &images): no feature vector list!" << std::endl; throw NoDataException("No features specified for feature vector calculation!"); } // calc. size of one column: int nCols = 0; for (int i=0; i<mImageCharFeatureVec.size(); ++i) { nCols += mImageCharFeatureVec[i]->vectorSize(); } // resize output matrix: const int nRows = images.size(); std::cout << "Matrix dimensions are: " << nRows << " x " << nCols << std::endl; mDataMatrix.resize(nRows, nCols); #pragma omp parallel for for (int i=0; i<nRows; ++i) { this->calculateOutputDataRow(*images[i], i, mDataMatrix); std::cout << "1Computed feature " << i+1 << " of " << nRows << std::endl; } // end for all rows i return; }
void KMeans::startClustering(const ClusterMethodParameters* pParameters, Document *pDocument, ClusteringResult *pClusteringResult) { mpDocument = pDocument; mpClusteringResult = pClusteringResult; this->mpDocument->setDistanceType(pParameters->dataType); if (pParameters->dataType == DISTANCE_BASED) { throw Exception("Distance based clustering not possible for KMeans!"); } std::cout << "computing features..." << std::endl; this->mpDocument->computeFeatures(); std::cout << "finished..." << std::endl; #if 1 // old code const KMeansParameters *pParams = (const KMeansParameters*)(pParameters); std::cout << "starting kmeans clustering..." << std::endl; std::cout << "nr of clusters: " << pParams->nClusters << std::endl; std::cout << "stopping parameters (max its, eps): " << pParams->maxIts << ", " << pParams->eps << std::endl; StopWatch watch; watch.start(); ublas::matrix<float>& dataMatrix = this->mpDocument->getCharFeatureCollectionPointer()->dataMatrixRef(); // std::cout << dataMatrix << std::endl; // std::cout << dataMatrix.size1() << ", " << dataMatrix.size2() << std::endl; const int nSamples = dataMatrix.size1(); if (dataMatrix.size1()*dataMatrix.size2() <= 0) { throw NoDataException("No features found for clustering with kmeans!"); } CvMat *dataMatrixOCV = OpenCV::cvMatFromBoostMat<float>(dataMatrix); // std::cout << "dataMatrixOCV, rows = " << dataMatrixOCV->rows << ", cols = " << dataMatrixOCV->cols << std::endl; // std::cout << "computed data matrix opencv!" << std::endl; CvMat* clusters = cvCreateMat( nSamples, 1, CV_32SC1 ); cvKMeans2( dataMatrixOCV, pParams->nClusters, clusters, cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, pParams->maxIts, pParams->eps ) ); // std::cout << "cvmat rows = " << clusters->rows << ", cols = " << clusters->cols << std::endl; std::cout << "finished k-means clustering using opencv!" 
<< std::endl; watch.stop(); // save cluster label vector to cluster result: std::vector<int> labels; for (int i=0; i<nSamples; ++i) { labels.push_back(clusters->data.i[i]); } mpClusteringResult->createClusteringResultFromLabelVector(labels, mpDocument); pDocument->clearAllPreprocessing(); pDocument->clearFeatures(); pClusteringResult->computePrototypeFeatures(); pClusteringResult->updatePrototypes(true); cvReleaseMat( &clusters ); #endif } // end startClustering
void CLARANSClustering::startClustering(const ClusterMethodParameters* pParameters, Document *pDocument, ClusteringResult *pClusteringResult) { mpDocument = pDocument; mpClusteringResult = pClusteringResult; this->mpDocument->setDistanceType(pParameters->dataType); // retrieve parameters: const CLARANSParameters *pParams = (const CLARANSParameters*)(pParameters); std::cout << "CLARANS parameters: numClust = " << pParams->numClust << ", numLocal = " << pParams->numLocal << ", maxNeighbor = " << pParams->maxNeighbor << std::endl; mpClusteringResult->deleteClustering(); // delete probably old clustering result const int nSamples = this->mpDocument->nParsedImages(); const int nClusts = pParams->numClust; if (nSamples < 2) { throw NoDataException("No data found for clustering!"); } if (pParameters->dataType == FEATURES_BASED && this->mpDocument->nFeatures() < 2) { throw NoDataException("No features found for clustering!"); } // get pointer to image chars: std::vector<ImageChar*> *imageCharVecPointer = this->mpDocument->getImageCharsVecPointer(); // INITIALIZATION: // srand(time(NULL)); // FIXME!!!!!!!!!!!!! double minCost = 1e32f; minCost = 1e32f; double actCost = 0.0f; double newCost = 0.0f; std::vector<int> currentNode; getRandomNode(nSamples, nClusts, currentNode); actCost = getNodeCost(nSamples, currentNode); std::cout << "Starting node: "; printNode(currentNode); // LOOP: int j = 1; j=1; std::vector<int> neighborNode; getRandomNeighbor(nSamples, currentNode, neighborNode); // determine a random neighbor node newCost = getNodeCost(nSamples, neighborNode); std::cout << "Finished CLARANS clustering!" << std::endl; return; } // end startClustering(...)
/*!
  \brief Lists the remote folders, downloads every file that matches the mask
         and filter, and invokes a callback for each downloaded file.

  \param mask               File name mask used to select the files to download.
  \param filter             Filter passed on to the folder and file name validation.
  \param timezone           Timezone used for the validation; "UTC+00" is used when empty.
  \param remover            Receives the temporary folder and every temporary file path.
  \param temporaryFolderUri Target folder for the downloads; when empty, a temporary
                            folder is obtained and the data provider path is appended.
  \param foldersMask        Optional folder mask; when not empty the folders to scan
                            are obtained from getFoldersList().
  \param processFile        Called after each successful download with
                            (temporary data dir, file name, relative uri path).

  \exception NoDataException        No folder matched the folders mask, or no
                                    remote file matched the mask and filter.
  \exception DataRetrieverException Any other failure (listing, download, callback).
*/
void terrama2::core::DataRetrieverFTP::retrieveDataCallback(const std::string& mask, const Filter& filter, const std::string& timezone, std::shared_ptr<terrama2::core::FileRemover> remover, const std::string& temporaryFolderUri, const std::string& foldersMask, std::function<void(const std::string &, const std::string &, const std::string&)> processFile) const
{
  try
  {
    // find valid directories; start with the data provider uri itself
    std::vector< std::string > baseUriList;
    baseUriList.push_back(dataProvider_->uri);

    // fall back to UTC when no timezone was given
    auto tz = timezone.empty() ? "UTC+00" : timezone;

    if(!foldersMask.empty())
    {
      // replace the base uri by the folders returned for the folders mask
      auto uriList = getFoldersList(baseUriList, foldersMask, tz, filter);

      if(uriList.empty())
      {
        QString errMsg = QObject::tr("No files found!");
        TERRAMA2_LOG_WARNING() << errMsg;
        throw terrama2::core::NoDataException() << ErrorDescription(errMsg);
      }

      baseUriList = uriList;
    }

    // Build URI to get PATH fragment
    te::core::URI dataProviderURI(dataProvider_->uri);

    // Set the temporary directory. When none was given, a new one is created and the
    // data provider path is appended, which yields "temporaryDir/dataProviderPath".
    // This matters because the folder may contain a temporal mask.
    std::string temporaryDataDir = temporaryFolderUri;
    if (temporaryFolderUri.empty())
      temporaryDataDir = getTemporaryFolder(remover, temporaryFolderUri) + dataProviderURI.path();

    // flags whether any folder contained a file for the dataset
    bool hasData = false;

    // Get a file listing from server
    for(const auto& uri : baseUriList)
    {
      std::vector<std::string> vectorFiles = curlwrapper_->listFiles(normalizeURI(uri));

      std::vector<std::string> vectorNames;
      // keep only the file names that should be downloaded.
      for(const std::string& fileName: vectorFiles)
      {
        // FIXME: use timestamp
        // (the timestamp produced by the validation is currently discarded)
        std::shared_ptr< te::dt::TimeInstantTZ > timestamp;
        if(terrama2::core::isValidDataSetName(mask,filter, tz, fileName,timestamp))
          vectorNames.push_back(fileName);
      }

      // nothing to download from this folder
      if(vectorNames.empty())
      {
        continue;
      }

      hasData = true;

      // folder path with the data provider path replaced by "/"
      te::core::URI u(uri);
      std::string uriPath = QString::fromStdString(u.path()).replace(dataProviderURI.path().c_str(), "/").toStdString();

      // Performs the download of files in the vectorNames
      for(const auto& file: vectorNames)
      {
        // Create the local directory structure for this folder
        QString saveDir(QString::fromStdString(temporaryDataDir+ "/" + uriPath));
        QString savePath = QUrl(saveDir).toLocalFile();
        QDir dir(savePath);
        if(!dir.exists())
          dir.mkpath(savePath);

        std::string uriOrigin = uri + "/" + file;
        std::string filePath = savePath.toStdString() + "/" + file;

        // register folder and file with the remover before the download starts,
        // so they are known to it even if the download fails
        remover->addTemporaryFolder(temporaryDataDir);
        remover->addTemporaryFile(filePath);

        try
        {
          curlwrapper_->downloadFile(normalizeURI(uriOrigin).uri(), filePath);
          TERRAMA2_LOG_WARNING() << QObject::tr("Finished downloading file: %1").arg(QString::fromStdString(file));
          processFile(temporaryDataDir, file, uriPath);
        }
        catch(const te::Exception& e)
        {
          // translate into a DataRetrieverException that names the failing file
          QString errMsg = QObject::tr("Error during download of file %1.\n").arg(QString::fromStdString(file));
          auto errStr = boost::get_error_info<te::ErrorDescription>(e);
          if(errStr)
            errMsg.append(QString::fromStdString(*errStr));
          errMsg.append(e.what());

          TERRAMA2_LOG_ERROR() << errMsg;
          throw DataRetrieverException() << ErrorDescription(errMsg);
        }
      }
    }

    if(!hasData)
    {
      QString errMsg = QObject::tr("No data in the remote server.");
      TERRAMA2_LOG_WARNING() << errMsg;
      throw NoDataException() << ErrorDescription(errMsg);
    }
  }
  // exceptions that already have the final type are passed through unchanged;
  // these handlers must stay in front of the more general ones below
  catch(const NoDataException&)
  {
    throw;
  }
  catch(const DataRetrieverException&)
  {
    throw;
  }
  // everything else is converted into a DataRetrieverException
  catch(const te::Exception& e)
  {
    QString errMsg = QObject::tr("Error during download.\n");
    errMsg.append(boost::get_error_info<terrama2::ErrorDescription>(e));
    errMsg.append(e.what());
    TERRAMA2_LOG_ERROR() << errMsg;
    throw DataRetrieverException() << ErrorDescription(errMsg);
  }
  catch(const std::exception& e)
  {
    QString errMsg = QObject::tr("Error during download.\n");
    errMsg.append(e.what());
    TERRAMA2_LOG_ERROR() << errMsg;
    throw DataRetrieverException() << ErrorDescription(errMsg);
  }
  catch(...)
  {
    throw DataRetrieverException() << ErrorDescription(QObject::tr("Unknown Error."));
  }
}
void AgglomerativeClustering::startClustering(const ClusterMethodParameters* pParameters, Document *pDocument, ClusteringResult *pClusteringResult) { mpDocument = pDocument; mpClusteringResult = pClusteringResult; // preprocess glyphs or compute features, depending on distance type: if (pParameters->dataType == FEATURES_BASED) { pDocument->computeFeatures(); } else { pDocument->preprocessAllGlyphs(); } // this->mpDocument->setDistanceType(pParameters->dataType); // retrieve parameters: const AgglomerativeParameters *pParams = (const AgglomerativeParameters*)(pParameters); // print some debug info: std::cout << "Starting agglomerative clustering algorithm..." << std::endl; std::cout << "nr of clusters: " << pParams->nClusters << std::endl; std::cout << "feature dist type: " << pParams->featureDistType << std::endl; std::cout << "cluster dist type: " << pParams->clusterDistType << std::endl; mpClusteringResult->deleteClustering(); // delete probably old clustering result // setClusterDistanceTypeFunctionPointer(pParams->clusterDistType); const int nSamples = this->mpDocument->nParsedImages(); if (nSamples < 2) { throw NoDataException("No data found for clustering!"); } if (pParameters->dataType == FEATURES_BASED && this->mpDocument->nFeatures() < 2) { throw NoDataException("No features found for clustering!"); } StopWatch watch; // initialize distance matrix: std::cout << "computing distance matrix..." 
<< std::endl; watch.start(); initDistanceMatrix(); watch.stop(); std::cout << "successfully computed distance matrix" << std::endl; // TEST: USING CLUSTER LIBRARY: double **tmpdistmatrix; // Allocate memory for distance matrix tmpdistmatrix = new double*[nSamples]; for (int i = 0; i < nSamples; ++i) tmpdistmatrix[i] = new double[nSamples]; // copy distance matrix for (int i=0;i<nSamples;++i){ for (int j=0; j<=i; ++j) { tmpdistmatrix[i][j] = mDistMat(i,j); tmpdistmatrix[j][i] = mDistMat(i,j); } } std::vector<int> labels(nSamples); #if 0 int npass = 100; double error; int ifound; watch.start(); kmedoids (pParams->nClusters, nSamples, tmpdistmatrix, npass, &labels[0], &error, &ifound); watch.stop(); std::cout << "finished kmedioids, error = " << error << ", ifound = " << ifound << std::endl; #else // method = 's' (single-linkage), 'm' (complete-linkage), 'a' (average-linkage) or 'c' (centroid-linkage): char methodChar='a'; switch (pParams->clusterDistType) { case AVG_DIST: methodChar='a'; break; case MIN_DIST: methodChar='s'; break; case MAX_DIST: methodChar='m'; break; default: throw Exception("Unknown distance type in AgglomerativeClustering::startClustering()"); break; } // end switch Node* tree = treecluster(nSamples, 1, NULL, NULL, NULL, 0, '_', methodChar, tmpdistmatrix); if (tree==NULL) std::cerr << "FATAL ERROR - NULL POINTER IN CLUSTER RESULT!" << std::endl; // cut hierarchical cluster tree at specified nr of clusters: cuttree(nSamples, tree, pParams->nClusters, &labels[0]); delete [] tree; #endif // De-Allocate memory for temporary distance matrix: for (int i = 0; i < nSamples; ++i) delete [] tmpdistmatrix[i]; delete [] tmpdistmatrix; std::cout << "finished agglo clustering with cluster libarary!!" 
<< std::endl; pClusteringResult->createClusteringResultFromLabelVector(labels, pDocument); /////////////////// END TEST #if 0 // get pointer to image chars: std::vector<ImageChar*> *imageCharVecPointer = this->mpDocument->getImageCharsVecPointer(); // create cluster for each instance: CharCluster *pCluster=NULL; for (int i=0; i<nSamples; ++i) { pCluster = mpClusteringResult->addEmptyCluster(this->mpDocument); pCluster->addChar( (*imageCharVecPointer)[i] ); } std::cout << "Starting merging process..." << std::endl; // watch.start(); std::vector<float> minValVec; std::vector<int> nClustsVec; // while nr of clusters not reached -> merge two nearest clusters // while (mClusterVec.size() > pParams->nClusters) { while (mpClusteringResult->nClusters() > pParams->nClusters) { // find min element of distance matrix: minValVec.push_back(mMinDist); // nClustsVec.push_back(mClusterVec.size()); nClustsVec.push_back(mpClusteringResult->nClusters()); // std::cout << "nr of clusters is " << mClusterVec.size() << std::endl; // std::cout << "min dist is " << mMinDist << " on index " << mMinIndex << std::endl; watch.start(); updateClusterLabels(); std::cout << "updated cluster labels, time = " << watch.stop(false) << std::endl; watch.start(); findMinDist(); std::cout << "found min dist, time = " << watch.stop(false) << std::endl; std::cout << "new nr of clusters is " << mpClusteringResult->nClusters() << std::endl; } // end while // writeTxtFile(nClustsVec, "C:/projekte/impact/matlab_sebastian/c_prog_out/n_clusts_vec.txt"); // writeTxtFile(minValVec, "C:/projekte/impact/matlab_sebastian/c_prog_out/min_val_vec.txt"); // watch.stop(); #endif pDocument->clearAllPreprocessing(); pDocument->clearFeatures(); pClusteringResult->computePrototypeFeatures(); pClusteringResult->updatePrototypes(true); return; } // end startClustering()
void CharDataReaderI::parseFiles(Document *pDocument, std::vector<ImageChar> &imageCharVec) const { imageCharVec.clear(); ParsingParameters pars = pDocument->parsingParameters(); const ParseSubsetType pType = pars.parsingType; const SubsetGenerationType genType = pars.generationType; const ParsingMethodType mType = pars.parsingMethod; if (mType == UNKNOWN_PARSING_METHOD) { throw Exception("An invalid parsing method type was given!"); } if (pType == UNKNOWN_PARSESUBSET_TYPE) { throw Exception("An invalid parsing subset type was given!"); } if (genType == UNKNOWN_GENERATION_TYPE) { throw Exception("An invalid parsing subset generation type was given!"); } // load document images if not done yet: if (!pDocument->isDocumentImagesLoaded()) { throw NoDataException("No input documents opened for parsing!"); // std::cout << "Opening document images for parsing, parsing type is: " << pars.parsingMethod << std::endl; // pDocument->openDocumentImages(); } // create XML filenames: std::string fileExtension = ""; if (mType == FINEREADER_XML || mType == IM2CHARRECTS_XML) { fileExtension = "xml"; } else if (mType == DAT_FILES) { fileExtension = "dat"; } std::cout << "file extension = " << fileExtension << std::endl; pDocument->createSegmentationResultFileNames(fileExtension, pars.filenamePrefix, pars.filenameSuffix); // std::cout << "created segmentation result filenames" << std::endl; for (int i=0; i<pDocument->nFiles(); ++i) { std::cout << "Now parsing image file " << i << std::endl; // 1st: parse single image file, store result in tmp-vectors: std::vector<ImageChar> tmpImageCharVec; this->parseSingleImageFile(pDocument->fileName(i), pDocument->segmentationResultFileName(i), pDocument->imagePointer(i), tmpImageCharVec, i); // 2nd: sort out susp. / non-susp. 
if needed, store result in tmp-vectors: std::vector<ImageChar> tmpImageCharVec2; for (int j=0; j<tmpImageCharVec.size(); ++j) { if ((pType == PARSE_ALL) || (pType == PARSE_ONLY_NON_SUSP && !tmpImageCharVec[j].suspicious) || (pType == PARSE_ONLY_SUSP && tmpImageCharVec[j].suspicious)) { tmpImageCharVec2.push_back(tmpImageCharVec[j]); } } // end for all parsed chars j // 3rd: generate subset of parsed chars according to generation type: tmpImageCharVec.clear(); double percentage = double(pars.subsetPerc) / 100.0f; int nCharsToPars = ceil(tmpImageCharVec2.size() * percentage); assert(nCharsToPars<=tmpImageCharVec2.size()); std::cout << "Parsing " << pars.subsetPerc << "% of " << tmpImageCharVec2.size() << " nr of chars = " << nCharsToPars << std::endl; if (genType == PARSE_RANDOMLY) { // take nCharsToPars nr of characters randomly from page std::vector<int> rand_tuple = RandomNumber::random_unique_integer_tuple(tmpImageCharVec2.size()); for (int k=0; k<nCharsToPars; k++) { imageCharVec.push_back(tmpImageCharVec2[rand_tuple[k]]); imageCharVec[imageCharVec.size()-1].id = k; } } else if (genType == PARSE_FIRST) { // take nCharsToPars nr of characters first from page for (int k=0; k<nCharsToPars; k++) { imageCharVec.push_back(tmpImageCharVec2[k]); imageCharVec[imageCharVec.size()-1].id = k; } } else if (genType == PARSE_LAST) { for (int l=0; l<nCharsToPars; l++) { int k = tmpImageCharVec2.size()-l-1; // std::cout << "k = " << k << std::endl; // std::cout << "(tmpImageCharVec2.size()-nCharsToPars) = " << (tmpImageCharVec2.size()-nCharsToPars) << std::endl; imageCharVec.push_back(tmpImageCharVec2[k]); imageCharVec[imageCharVec.size()-1].id = k; } } } // end for all files i return; }