// =============================================================================
PyObject * Epetra_NumPyMultiVector::ExtractCopy() const
{
  return PyArray_NewCopy(array,NPY_ANYORDER);
}
static PyObject *
dotblas_matrixproduct(PyObject *dummy, PyObject *args)
{
    PyObject *op1, *op2;
    PyArrayObject *ap1=NULL, *ap2=NULL, *ret=NULL;
    int j, l, lda, ldb, ldc;
    int typenum, nd;
    intp ap1stride=0;
    intp dimensions[MAX_DIMS];
    intp numbytes;
    static const float oneF[2] = {1.0, 0.0};
    static const float zeroF[2] = {0.0, 0.0};
    static const double oneD[2] = {1.0, 0.0};
    static const double zeroD[2] = {0.0, 0.0};
    double prior1, prior2;
    PyTypeObject *subtype;
    PyArray_Descr *dtype;
    MatrixShape ap1shape, ap2shape;

    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;

    /*
     * "Matrix product" using the BLAS.
     * Only works for float double and complex types.
     */

    typenum = PyArray_ObjectType(op1, 0);
    typenum = PyArray_ObjectType(op2, typenum);

    /* This function doesn't handle other types */
    if ((typenum != PyArray_DOUBLE && typenum != PyArray_CDOUBLE &&
            typenum != PyArray_FLOAT && typenum != PyArray_CFLOAT)) {
        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct(op1, op2));
    }

    dtype = PyArray_DescrFromType(typenum);
    ap1 = (PyArrayObject *)PyArray_FromAny(op1, dtype, 0, 0, ALIGNED, NULL);
    if (ap1 == NULL) return NULL;
    Py_INCREF(dtype);
    ap2 = (PyArrayObject *)PyArray_FromAny(op2, dtype, 0, 0, ALIGNED, NULL);
    if (ap2 == NULL) goto fail;


    if ((ap1->nd > 2) || (ap2->nd > 2)) {
        /* This function doesn't handle dimensions greater than 2
           (or negative striding)  -- other
           than to ensure the dot function is altered
        */
        if (!altered) {
            /* need to alter dot product */
            PyObject *tmp1, *tmp2;
            tmp1 = PyTuple_New(0);
            tmp2 = dotblas_alterdot(NULL, tmp1);
            Py_DECREF(tmp1);
            Py_DECREF(tmp2);
        }
        ret = (PyArrayObject *)PyArray_MatrixProduct((PyObject *)ap1,
                (PyObject *)ap2);
        Py_DECREF(ap1);
        Py_DECREF(ap2);
        return PyArray_Return(ret);
    }

    if (_bad_strides(ap1)) {
        op1 = PyArray_NewCopy(ap1, PyArray_ANYORDER);
        Py_DECREF(ap1);
        ap1 = (PyArrayObject *)op1;
        if (ap1 == NULL) goto fail;
    }
    if (_bad_strides(ap2)) {
        op2 = PyArray_NewCopy(ap2, PyArray_ANYORDER);
        Py_DECREF(ap2);
        ap2 = (PyArrayObject *)op2;
        if (ap2 == NULL) goto fail;
    }
    ap1shape = _select_matrix_shape(ap1);
    ap2shape = _select_matrix_shape(ap2);

    if (ap1shape == _scalar || ap2shape == _scalar) {
        PyArrayObject *oap1, *oap2;
        oap1 = ap1;
        oap2 = ap2;
        /* One of ap1 or ap2 is a scalar */
        if (ap1shape == _scalar) { 		/* Make ap2 the scalar */
            PyArrayObject *t = ap1;
            ap1 = ap2;
            ap2 = t;
            ap1shape = ap2shape;
            ap2shape = _scalar;
        }

        if (ap1shape == _row) ap1stride = ap1->strides[1];
        else if (ap1->nd > 0) ap1stride = ap1->strides[0];

        if (ap1->nd == 0 || ap2->nd == 0) {
            intp *thisdims;
            if (ap1->nd == 0) {
                nd = ap2->nd;
                thisdims = ap2->dimensions;
            }
            else {
                nd = ap1->nd;
                thisdims = ap1->dimensions;
            }
            l = 1;
            for (j=0; j<nd; j++) {
                dimensions[j] = thisdims[j];
                l *= dimensions[j];
            }
        }
        else {
            l = oap1->dimensions[oap1->nd-1];

            if (oap2->dimensions[0] != l) {
                PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
                goto fail;
            }
            nd = ap1->nd + ap2->nd - 2;
            /* nd = 0 or 1 or 2 */
            /* If nd == 0 do nothing ... */
            if (nd == 1) {
                /* Either ap1->nd is 1 dim or ap2->nd is 1 dim
                   and the other is 2-dim */
                dimensions[0] = (oap1->nd == 2) ? oap1->dimensions[0] : oap2->dimensions[1];
                l = dimensions[0];
                /* Fix it so that dot(shape=(N,1), shape=(1,))
                   and dot(shape=(1,), shape=(1,N)) both return
                   an (N,) array (but use the fast scalar code)
                */
            }
            else if (nd == 2) {
                dimensions[0] = oap1->dimensions[0];
                dimensions[1] = oap2->dimensions[1];
                /* We need to make sure that dot(shape=(1,1), shape=(1,N))
                   and dot(shape=(N,1),shape=(1,1)) uses
                   scalar multiplication appropriately
                */
                if (ap1shape == _row) l = dimensions[1];
                else l = dimensions[0];
            }
        }
    }
    else { /* (ap1->nd <= 2 && ap2->nd <= 2) */
        /*  Both ap1 and ap2 are vectors or matrices */
        l = ap1->dimensions[ap1->nd-1];

        if (ap2->dimensions[0] != l) {
            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
            goto fail;
        }
        nd = ap1->nd+ap2->nd-2;

        if (nd == 1)
            dimensions[0] = (ap1->nd == 2) ? ap1->dimensions[0] : ap2->dimensions[1];
        else if (nd == 2) {
            dimensions[0] = ap1->dimensions[0];
            dimensions[1] = ap2->dimensions[1];
        }
    }

    /* Choose which subtype to return */
    if (ap1->ob_type != ap2->ob_type) {
        prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
        prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
        subtype = (prior2 > prior1 ? ap2->ob_type : ap1->ob_type);
    }
    else {
        prior1 = prior2 = 0.0;
        subtype = ap1->ob_type;
    }

    ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
                                       typenum, NULL, NULL, 0, 0,
                                       (PyObject *)
                                       (prior2 > prior1 ? ap2 : ap1));

    if (ret == NULL) goto fail;
    numbytes = PyArray_NBYTES(ret);
    memset(ret->data, 0, numbytes);
    if (numbytes==0 || l == 0) {
        Py_DECREF(ap1);
        Py_DECREF(ap2);
        return PyArray_Return(ret);
    }


    if (ap2shape == _scalar) {
        /* Multiplication by a scalar -- Level 1 BLAS */
        /* if ap1shape is a matrix and we are not contiguous, then we can't
           just blast through the entire array using a single
           striding factor */
        NPY_BEGIN_ALLOW_THREADS

        if (typenum == PyArray_DOUBLE) {
            if (l == 1) {
                *((double *)ret->data) = *((double *)ap2->data) * \
                                         *((double *)ap1->data);
            }
            else if (ap1shape != _matrix) {
                cblas_daxpy(l, *((double *)ap2->data), (double *)ap1->data,
                            ap1stride/sizeof(double), (double *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                double val;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                val = *((double *)ap2->data);
                a1s = ap1->strides[maxind] / sizeof(double);
                rets = ret->strides[maxind] / sizeof(double);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_daxpy(l, val, (double *)ptr, a1s,
                                (double *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        else if (typenum == PyArray_CDOUBLE) {
            if (l == 1) {
                cdouble *ptr1, *ptr2, *res;
                ptr1 = (cdouble *)ap2->data;
                ptr2 = (cdouble *)ap1->data;
                res = (cdouble *)ret->data;
                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
            }
            else if (ap1shape != _matrix) {
                cblas_zaxpy(l, (double *)ap2->data, (double *)ap1->data,
                            ap1stride/sizeof(cdouble), (double *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                double *pval;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                pval = (double *)ap2->data;
                a1s = ap1->strides[maxind] / sizeof(cdouble);
                rets = ret->strides[maxind] / sizeof(cdouble);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_zaxpy(l, pval, (double *)ptr, a1s,
                                (double *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        else if (typenum == PyArray_FLOAT) {
            if (l == 1) {
                *((float *)ret->data) = *((float *)ap2->data) * \
                                        *((float *)ap1->data);
            }
            else if (ap1shape != _matrix) {
                cblas_saxpy(l, *((float *)ap2->data), (float *)ap1->data,
                            ap1stride/sizeof(float), (float *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                float val;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                val = *((float *)ap2->data);
                a1s = ap1->strides[maxind] / sizeof(float);
                rets = ret->strides[maxind] / sizeof(float);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_saxpy(l, val, (float *)ptr, a1s,
                                (float *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        else if (typenum == PyArray_CFLOAT) {
            if (l == 1) {
                cfloat *ptr1, *ptr2, *res;
                ptr1 = (cfloat *)ap2->data;
                ptr2 = (cfloat *)ap1->data;
                res = (cfloat *)ret->data;
                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
            }
            else if (ap1shape != _matrix) {
                cblas_caxpy(l, (float *)ap2->data, (float *)ap1->data,
                            ap1stride/sizeof(cfloat), (float *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                float *pval;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                pval = (float *)ap2->data;
                a1s = ap1->strides[maxind] / sizeof(cfloat);
                rets = ret->strides[maxind] / sizeof(cfloat);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_caxpy(l, pval, (float *)ptr, a1s,
                                (float *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        NPY_END_ALLOW_THREADS
    }