Beispiel #1
0
void Print(const SparseMatrix<T>& M)
{
    // Print a SparseMatrix to the screen.
    //
    // Output, in order:
    //   1. one "(row, col): value" line per stored entry, walking the
    //      columns from 0 to Width()-1 and, within each column, the offsets
    //      in [col_buf[c], col_buf[c+1])
    //   2. the raw column-offset array (Width()+1 entries)
    //   3. the raw row-index array (Size() entries)
    //   4. the raw data array (Size() entries)
    //
    // If M.Size() is zero, only the message "Matrix is empty." is printed.

    const unsigned int* col_buf = M.LockedColBuffer();
    const unsigned int* row_buf = M.LockedRowBuffer();
    const T*                buf = M.LockedDataBuffer();

    const unsigned int width = M.Width();
    const unsigned int size  = M.Size();

    if (0 == size)
    {
        std::cout << "Matrix is empty." << std::endl;
        return;
    }

    for (unsigned int c=0; c != width; ++c)
    {
        const unsigned int start = col_buf[c];
        const unsigned int end   = col_buf[c+1];
        for (unsigned int offset=start; offset != end; ++offset)
        {
            // offset is unsigned, so only the upper bound needs checking
            assert(offset < size);

            // '\n' rather than std::endl: no need to flush the stream once
            // per entry; the explicit flush below pushes everything out
            std::cout << "(" << row_buf[offset] << ", " << c << "): "
                      << buf[offset] << '\n';
        }
    }

    std::cout << "Col indices: "; std::cout.flush();
    for (unsigned int i=0; i != width; ++i)
        std::cout << col_buf[i] << ", ";
    std::cout << col_buf[width] << std::endl;

    std::cout << "Row indices: "; std::cout.flush();
    for (unsigned int i=0; i != size; ++i)
        std::cout << row_buf[i] << ", ";
    std::cout << std::endl;

    std::cout << "Data:        "; std::cout.flush();
    for (unsigned int i=0; i != size; ++i)
        std::cout << buf[i] << ", ";
    std::cout << std::endl;
}
Beispiel #2
0
T FrobeniusNorm(const SparseMatrix<T>& A)
{
    // Frobenius norm of A: the square root of the sum of the squared
    // magnitudes of the entries held in A's data buffer.  Only the
    // A.Size() stored entries are visited.
    const T*       cursor = A.LockedDataBuffer();
    const T* const stop   = cursor + A.Size();

    T sum_of_squares = T(0);
    for (; cursor != stop; ++cursor)
    {
        const T magnitude = fabs(*cursor);
        sum_of_squares += magnitude*magnitude;
    }

    return sqrt(sum_of_squares);
}
Beispiel #3
0
T MaxNorm(const SparseMatrix<T>& A)
{
    // Max norm of A: the largest magnitude among the entries held in A's
    // data buffer.  Returns T(0) when the buffer holds no entries.
    const T*           entries = A.LockedDataBuffer();
    const unsigned int count   = A.Size();

    T largest = T(0);
    unsigned int idx = 0;
    while (idx != count)
    {
        const T magnitude = fabs(entries[idx]);
        largest = (magnitude > largest) ? magnitude : largest;
        ++idx;
    }

    return largest;
}
Beispiel #4
0
bool WriteMatrixMarketFile(const std::string& file_path,
                           const SparseMatrix<T>& A,
                           const unsigned int precision)
{
    // Write a MatrixMarket file with no comments.  Note that the
    // MatrixMarket format uses 1-based indexing for rows and columns.

    std::ofstream outfile(file_path);
    if (!outfile)
        return false;

    unsigned int height = A.Height();
    unsigned int width  = A.Width();
    unsigned int nnz    = A.Size();
    
    // write the 'banner'
    outfile << MM_BANNER << " matrix coordinate real general" << std::endl;

    // write matrix dimensions and number of nonzeros
    outfile << height << " " << width << " " << nnz << std::endl;

    outfile << std::fixed;
    outfile.precision(precision);
    
    const unsigned int* cols_a = A.LockedColBuffer();
    const unsigned int* rows_a = A.LockedRowBuffer();
    const T*            data_a = A.LockedDataBuffer();
    unsigned int width_a = A.Width();

    for (unsigned int c=0; c != width_a; ++c)
    {
        unsigned int start = cols_a[c];
        unsigned int end   = cols_a[c+1];
        for (unsigned int offset=start; offset != end; ++offset)
        {
            unsigned int r = rows_a[offset];
            T val = data_a[offset];
            outfile << r+1 << " " << c+1 << " " << val << std::endl;
        }
    }

    outfile.close();
    return true;
}
Beispiel #5
0
//-----------------------------------------------------------------------------
void Nmf(const unsigned int kval, 
         const Algorithm algorithm,
         const std::string& csv_file_w,
         const std::string& csv_file_h)
{
    // Factor the currently loaded m x n matrix into W (m x k) and H (k x n),
    // then write both factors to disk.
    //
    //   kval       : requested rank k; must be > 0.  Ignored (k is forced
    //                to 2) when algorithm is Algorithm::RANK2.
    //   algorithm  : which NMF algorithm to run
    //   csv_file_w : file with initial values for W; if empty, W is
    //                initialized by RandomMatrix
    //   csv_file_h : file with initial values for H; if empty, H is
    //                initialized by RandomMatrix
    //
    // Reads and updates state declared outside this function (matrix
    // dimensions m and n, k, the W/H buffers and leading dimensions,
    // nmf_opts, the iteration limits, output settings, etc.).
    //
    // Throws std::logic_error for invalid state or arguments (no matrix
    // loaded, inconsistent iteration limits, k == 0, factor too large,
    // unknown algorithm, non-conformant initializer matrix) and
    // std::runtime_error if initialization, the solver, or writing the
    // results fails.

    if (!matrix_loaded)
        throw std::logic_error("smallk error (NMF): no matrix has been loaded.");

    if (max_iter < min_iter)
        throw std::logic_error("smallk error (NMF): min_iterations exceeds max_iterations.");

    if (0 == kval)
        throw std::logic_error("smallk error (NMF): k must be greater than 0.");

    // The RANK2 algorithm always runs with k == 2 regardless of kval, so
    // determine the k that will actually be used before checking sizes.
    const unsigned int effective_k = (Algorithm::RANK2 == algorithm) ? 2u : kval;

    // Check the sizes of matrix W(m, k) and matrix H(k, n) and make sure 
    // they don't overflow Elemental's default signed int index type.

    if (!SizeCheck<int>(m, effective_k))
        throw std::logic_error("smallk error (Nmf): mxk matrix W is too large.");
    
    if (!SizeCheck<int>(effective_k, n))
        throw std::logic_error("smallk error (Nmf): kxn matrix H is too large.");

    k = effective_k;

    // convert to the 'NmfAlgorithm' type in nmf.hpp
    switch (algorithm)
    {
    case Algorithm::MU:
        nmf_opts.algorithm = NmfAlgorithm::MU;
        break;
    case Algorithm::HALS:
        nmf_opts.algorithm = NmfAlgorithm::HALS;
        break;
    case Algorithm::RANK2:
        nmf_opts.algorithm = NmfAlgorithm::RANK2;
        break;
    case Algorithm::BPP:
        nmf_opts.algorithm = NmfAlgorithm::BPP;
        break;
    default:
        throw std::logic_error("smallk error (NMF): unknown NMF algorithm.");
    }

    // W and H are stored with leading dimensions equal to their heights
    ldim_w = m;
    ldim_h = k;

    // grow (never shrink) the factor buffers to hold m x k and k x n
    if (buf_w.size() < m*k)
        buf_w.resize(m*k);
    if (buf_h.size() < k*n)
        buf_h.resize(k*n);
    
    // initialize matrices W and H
    bool ok;

    // Expected dimensions; LoadDelimitedFile receives these by name and
    // they are compared against m, k, n after loading.
    unsigned int height_w = m, width_w = k, height_h = k, width_h = n;

    cout << "Initializing matrix W..." << endl;
    if (csv_file_w.empty())
        ok = RandomMatrix(&buf_w[0], ldim_w, m, k, rng);
    else
        ok = LoadDelimitedFile(buf_w, height_w, width_w, csv_file_w);
    if (!ok)
    {
        std::ostringstream msg;
        msg << "smallk error (Nmf): load failed for file ";
        msg << "\"" << csv_file_w << "\"";
        throw std::runtime_error(msg.str());
    }

    if ( (height_w != m) || (width_w != k))
    {
        cerr << "\tdimensions of matrix W are " << height_w
             << " x " << width_w << endl;
        cerr << "\texpected " << m << " x " << k << endl;
        throw std::logic_error("smallk error (Nmf): non-conformant matrix W.");
    }

    cout << "Initializing matrix H..." << endl;
    if (csv_file_h.empty())
        ok = RandomMatrix(&buf_h[0], ldim_h, k, n, rng);
    else
        ok = LoadDelimitedFile(buf_h, height_h, width_h, csv_file_h);

    if (!ok)
    {
        std::ostringstream msg;
        msg << "smallk error (Nmf): load failed for file ";
        msg << "\"" << csv_file_h << "\"";
        throw std::runtime_error(msg.str());
    }
    
    if ( (height_h != k) || (width_h != n))
    {
        cerr << "\tdimensions of matrix H are " << height_h
             << " x " << width_h << endl;
        cerr << "\texpected " << k << " x " << n << endl;
        throw std::logic_error("smallk error (Nmf): non-conformant matrix H.");
    }    

    // The ratio of projected gradient norms doesn't seem to work very well
    // with MU.  We frequently observe a 'leveling off' behavior and the 
    // convergence is even slower than usual.  So for MU use the relative
    // change in the Frobenius norm of W as the stopping criterion, which
    // always seems to behave well, even though it is on shaky theoretical
    // ground.

    if (NmfAlgorithm::MU == nmf_opts.algorithm)
        nmf_opts.prog_est_algorithm = NmfProgressAlgorithm::DELTA_FNORM;
    else
        nmf_opts.prog_est_algorithm = NmfProgressAlgorithm::PG_RATIO;

    nmf_opts.tol         = nmf_tolerance;
    nmf_opts.height      = m;
    nmf_opts.width       = n;
    nmf_opts.k           = k;
    nmf_opts.min_iter    = min_iter;
    nmf_opts.max_iter    = max_iter;
    nmf_opts.tolcount    = 1;
    nmf_opts.max_threads = max_threads;
    nmf_opts.verbose     = true;
    nmf_opts.normalize   = true;

    // display all params to user
    PrintNmfOpts(nmf_opts);

    // run the solver on whichever representation of the matrix is loaded
    NmfStats stats;
    Result result;
    if (is_sparse)
    {
        result = NmfSparse(nmf_opts, 
                           A.Height(), A.Width(), A.Size(),
                           A.LockedColBuffer(),
                           A.LockedRowBuffer(),
                           A.LockedDataBuffer(),
                           &buf_w[0], ldim_w,
                           &buf_h[0], ldim_h,
                           stats);
    }
    else
    {
        result = Nmf(nmf_opts,
                     &buf_a[0], ldim_a,
                     &buf_w[0], ldim_w,
                     &buf_h[0], ldim_h,
                     stats);
    }

    // the elapsed time is reported even if the solver failed
    cout << "Elapsed wall clock time: ";
    cout << ElapsedTime(stats.elapsed_us) << endl;
    cout << endl;

    if (Result::OK != result)
        throw std::runtime_error("smallk error (Nmf): NMF solver failure.");

    // write the computed W and H factors to disk

    // NOTE(review): outdir is concatenated directly with the default file
    // names, so it presumably already ends with a path separator - confirm.
    std::string outfile_w, outfile_h;
    if (outdir.empty())
    {
        outfile_w = DEFAULT_FILENAME_W;
        outfile_h = DEFAULT_FILENAME_H;
    }
    else
    {
        outfile_w = outdir + DEFAULT_FILENAME_W;
        outfile_h = outdir + DEFAULT_FILENAME_H;
    }

    cout << "Writing output files..." << endl;
    
    if (!WriteDelimitedFile(&buf_w[0], ldim_w, m, k, outfile_w, outprecision))
        throw std::runtime_error("smallk error (Nmf): could not write W result.");
    
    if (!WriteDelimitedFile(&buf_h[0], ldim_h, k, n, outfile_h, outprecision))
        throw std::runtime_error("smallk error (Nmf): could not write H result.");
}
Beispiel #6
0
//-----------------------------------------------------------------------------
int main(int argc, char* argv[])
{
    Timer timer;

    Random rng;
    rng.SeedFromTime();
    //rng.SeedFromInt(78);

    CommandLineOptions opts;
    if (!ParseCommandLine(argc, argv, opts))
    {
        if (opts.show_help)
        {
            ShowHelp(argv[0]);
            return 0;
        }
        else
        {
            // command line error
            return -1;
        }
    }

    // validate command line options
    if (!IsValid(opts))
        return -1;

    NmfInitialize(argc, argv);

    bool ok = true;
    unsigned int m, n, nnz, ldim_a, num_clusters;

    //-------------------------------------------------------------------------
    //
    //                 load the dictionary file
    //
    //-------------------------------------------------------------------------
    
    if (opts.clust_opts.verbose)
        cout << "loading dictionary..." << endl;

    std::vector<std::string> dictionary;
    if (!LoadStringsFromFile(opts.dictfile, dictionary))
    {
        cerr << "\ncould not load dictionary file " << opts.dictfile << endl;
        NmfFinalize();
        return -1;
    }

    //-------------------------------------------------------------------------
    //
    //                 load matrix A, the data matrix
    //
    //-------------------------------------------------------------------------
    if (opts.clust_opts.verbose)
        cout << "loading matrix..." << endl;

    SparseMatrix<R> A;
    std::vector<R> buf_a;

    if (IsSparse(opts.infile_A))
    {
        if (!LoadSparseMatrix(opts.infile_A, A, m, n, nnz))
        {
            cerr << "\nload failed for file " << opts.infile_A << endl;
            NmfFinalize();
            return -1;
        }
    }
    else if (IsDense(opts.infile_A))
    {
        ok = LoadDenseMatrix(opts.infile_A, buf_a, m, n);
        if (!ok || (buf_a.size() < m*n))
        {
            cerr << "\nload failed for file " << opts.infile_A << endl;
            NmfFinalize();
            return -1;
        }
    }
    else
    {
        cerr << "\nunsupported file type: " << opts.infile_A << endl;
        NmfFinalize();
        return -1;
    }

    num_clusters = opts.clust_opts.num_clusters;

    // HierNMF2 requires W and H initializer matrices of dimensions
    //
    //    W: m x 2
    //    H: 2 x n
    //
    // Elemental uses a default signed 32-bit integer for its index type,
    // and it computes offsets into the data buffer with this same type.
    // The following lines check to see that the W and H matrices fit
    // within these limits.
    
    // check W matrix (2*m elements required)
    uint64_t required_size = static_cast<uint64_t>(m);
    required_size *= 2u;
    if (!FitsWithin<int>(required_size))
    {
        cerr << "W matrix size too large" << endl;
        NmfFinalize();
        return -1;
    }

    // check H matrix (2*n elements required)
    required_size = static_cast<uint64_t>(n);
    required_size *= 2u;
    if (!FitsWithin<int>(required_size))
    {
        cerr << "H matrix size too large" << endl;
        NmfFinalize();
        return -1;
    }

    opts.clust_opts.nmf_opts.height = m;
    opts.clust_opts.nmf_opts.width  = n;
    opts.clust_opts.nmf_opts.k      = 2;

    // leading dimensions for dense matrix A data buffer
    ldim_a = m;
    
    // print a summary of all options
    if (opts.clust_opts.verbose)
        PrintOpts(opts);

    //-------------------------------------------------------------------------
    //
    //                 run the selected clustering algorithm
    //
    //-------------------------------------------------------------------------

    // W and H buffer for flat clustering
    std::vector<R> buf_w(m*num_clusters);
    std::vector<R> buf_h(num_clusters*n);

    Tree<R> tree;
    ClustStats stats;
    std::vector<float> probabilities;
    std::vector<unsigned int> assignments_flat;
    std::vector<int> term_indices(opts.clust_opts.maxterms * num_clusters);
    Result result = Result::OK;

    timer.Start();

    if (A.Size() > 0)
    {
        result = ClustSparse(opts.clust_opts, A,
                             &buf_w[0], &buf_h[0], tree, stats, rng);
    }
    else
    {
        result = Clust(opts.clust_opts, &buf_a[0], ldim_a,
                       &buf_w[0], &buf_h[0], tree, stats, rng);
    }
    
    if (opts.clust_opts.flat)
    {
        // compute flat clustering assignments and top terms
        unsigned int k = num_clusters;
        ComputeFuzzyAssignments(probabilities, &buf_h[0], k, k, n);
        ComputeAssignments(assignments_flat, &buf_h[0], k, k, n);
        TopTerms(opts.clust_opts.maxterms, &buf_w[0], m, m, k, term_indices);        
    }
 
    timer.Stop();
    double elapsed = timer.ReportMilliseconds();

    cout << "\nElapsed wall clock time: ";
    if (elapsed < 1000.0)
        cout << elapsed << " ms." << endl;
    else
        cout << elapsed*0.001 << " s." << endl;

    int num_converged = stats.nmf_count - stats.max_count;
    cout << num_converged << "/" << stats.nmf_count << " factorizations"
         << " converged." << endl << endl;

    //-------------------------------------------------------------------------
    //
    //                 write results
    //
    //-------------------------------------------------------------------------

    if (Result::FAILURE == result)
    {
        cerr << "\nHierarchical clustering fatal error." << endl;
    }
    else
    {    
        if (opts.clust_opts.verbose)
            cout << "Writing output files..." << endl;

        if (!tree.WriteAssignments(opts.assignfile))
            cerr << "\terror writing assignments file" << endl;

        IHierclustWriter* writer = CreateHierclustWriter(opts.format);
        if (!tree.WriteTree(writer, opts.treefile, dictionary))
            cerr << "\terror writing factorization file" << endl;

        if (opts.clust_opts.flat && Result::FLATCLUST_FAILURE != result)
        {
            FlatClustWriteResults(opts.outdir, assignments_flat, probabilities,
                                  dictionary, term_indices, opts.format,
                                  opts.clust_opts.maxterms, n,
                                  opts.clust_opts.num_clusters);
        }

        delete writer;
    }

    NmfFinalize();
    return 0;
}