/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.matrix.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.data.Block;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.DenseBlockFactory;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockCSR;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.data.SparseRow;
import org.apache.sysds.runtime.data.SparseRowVector;
import org.apache.sysds.runtime.functionobjects.DiagIndex;
import org.apache.sysds.runtime.functionobjects.RevIndex;
import org.apache.sysds.runtime.functionobjects.SortIndex;
import org.apache.sysds.runtime.functionobjects.SwapIndex;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.matrix.data.IJV;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
import org.apache.sysds.runtime.meta.DataCharacteristics;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.DataConverter;
import org.apache.sysds.runtime.util.SortUtils;
import org.apache.sysds.runtime.util.UtilFunctions;

public class LibMatrixReorg {
    /** Class-level logger, keyed by the fully qualified name of this class. */
    protected static final Log LOG = LogFactory.getLog((String)LibMatrixReorg.class.getName());
    /**
     * Cell-count threshold (0x100000 = 1,048,576). The multi-threaded transpose falls back to
     * the single-threaded implementation when rlen * clen of the input is below this value.
     * Note that this field is a mutable (non-final) static.
     */
    public static long PAR_NUMCELL_THRESHOLD = 0x100000L;
    // NOTE(review): presumably the row-count threshold below which sort runs single-threaded;
    // the sort method compares against a literal 1024 -- confirm it is this constant, inlined.
    public static final int PAR_NUMCELL_THRESHOLD_SORT = 1024;
    // NOTE(review): the next two flags are not referenced in the visible part of this file;
    // their names suggest shallow-copy and CSR-output switches -- confirm against their users.
    public static final boolean SHALLOW_COPY_REORG = true;
    public static final boolean SPARSE_OUTPUTS_IN_CSR = true;
    // Per-thread double[] slot whose initial value is null; its users are not visible here.
    private static ThreadLocal<double[]> memPool = new ThreadLocal<double[]>(){

        @Override
        protected double[] initialValue() {
            return null;
        }
    };

    /** Private no-op constructor; the members visible in this class are all static. */
    private LibMatrixReorg() {
    }

    /**
     * Tells whether the given operator maps to a reorg type known to this library.
     *
     * @param op reorg operator whose function object is classified
     * @return {@code true} unless the operator is classified as {@code ReorgType.INVALID}
     */
    public static boolean isSupportedReorgOperator(ReorgOperator op) {
        final ReorgType type = getReorgType(op);
        return type != ReorgType.INVALID;
    }

    /**
     * Dispatches a reorg operation (transpose, rev, diag or sort) to its implementation.
     * Transpose and sort use their multi-threaded variants when the operator requests more
     * than one thread.
     *
     * @param in  input block
     * @param out output block handed through to the selected implementation
     * @param op  reorg operator; its function object selects the operation
     * @return the block returned by the selected implementation
     * @throws DMLRuntimeException if the operator is none of the four supported types
     */
    public static MatrixBlock reorg(MatrixBlock in, MatrixBlock out, ReorgOperator op) {
        switch (getReorgType(op)) {
            case TRANSPOSE:
                return op.getNumThreads() > 1
                    ? transpose(in, out, op.getNumThreads())
                    : transpose(in, out);
            case REV:
                return rev(in, out);
            case DIAG:
                return diag(in, out);
            case SORT: {
                final SortIndex sortFn = (SortIndex) op.fn;
                final boolean parallel = op.getNumThreads() > 1;
                return parallel
                    ? sort(in, out, sortFn.getCols(), sortFn.getDecreasing(), sortFn.getIndexReturn(), op.getNumThreads())
                    : sort(in, out, sortFn.getCols(), sortFn.getDecreasing(), sortFn.getIndexReturn());
            }
            default:
                break;
        }
        throw new DMLRuntimeException("Unsupported reorg operator: " + op.fn);
    }

    /**
     * In-place variant of {@code reorg}; only transpose is implemented.
     *
     * @param in block to reorganize
     * @param op reorg operator; its function object selects the operation
     * @return the block returned by {@code transposeInPlace}
     * @throws DMLRuntimeException for rev and sort ("Not implemented inplace") and for every
     *         other non-transpose operator ("Unsupported inplace reorg operator")
     */
    public static MatrixBlock reorgInPlace(MatrixBlock in, ReorgOperator op) {
        final ReorgType type = getReorgType(op);
        if (type == ReorgType.TRANSPOSE) {
            return transposeInPlace(in, op.getNumThreads());
        }
        final String fnName = op.fn.getClass().getSimpleName();
        if (type == ReorgType.REV || type == ReorgType.SORT) {
            throw new DMLRuntimeException("Not implemented inplace: " + fnName);
        }
        throw new DMLRuntimeException("Unsupported inplace reorg operator: " + fnName);
    }

    /**
     * Single-threaded transpose into a freshly allocated output block. The output format
     * (sparse or dense) is decided by {@code MatrixBlock.evalSparseFormatInMemory} for the
     * transposed dimensions, with its last argument set to {@code true}.
     *
     * @param in input block
     * @return the transposed block
     */
    public static MatrixBlock transpose(MatrixBlock in) {
        final int outRows = in.getNumColumns();
        final int outCols = in.getNumRows();
        final long nnz = in.getNonZeros();
        final boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(outRows, outCols, nnz, true);
        final MatrixBlock out = new MatrixBlock(outRows, outCols, sparseOut);
        return transpose(in, out);
    }

    /**
     * Single-threaded transpose of {@code in} into the caller-provided {@code out}.
     * The kernel is chosen from the sparse/dense formats of input and output.
     *
     * @param in  input block; must not be a compressed block
     * @param out output block; returned untouched if the input is empty
     * @return {@code out}
     * @throws DMLCompressionException if {@code in} is a {@code CompressedMatrixBlock}
     */
    public static MatrixBlock transpose(MatrixBlock in, MatrixBlock out) {
        if (in instanceof CompressedMatrixBlock) {
            throw new DMLCompressionException("Invalid call to transposed with a compressed matrix block");
        }
        if (in.isEmptyBlock(false)) {
            return out;
        }
        out.nonZeros = in.nonZeros;

        // dense vector: the value layout is unchanged, so wrap the input values with swapped dims
        if (!in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1)) {
            out.denseBlock = DenseBlockFactory.createDenseBlock(in.getDenseBlockValues(), in.clen, in.rlen);
            return out;
        }

        // allocate the output in its target format
        if (out.sparse) {
            out.allocateSparseRowsBlock(false);
        } else {
            out.allocateDenseBlock(false);
        }

        // pick the kernel by format combination
        final boolean sparseIn = in.sparse;
        final boolean sparseOut = out.sparse;
        if (sparseIn && sparseOut) {
            // fewer non-zeros than max(rows, cols): cell-wise ultra-sparse path
            if (in.nonZeros < (long) Math.max(in.rlen, in.clen)) {
                transposeUltraSparse(in, out);
            } else {
                transposeSparseToSparse(in, out, 0, in.rlen, 0, in.clen, countNnzPerColumn(in, 4096));
            }
        } else if (sparseIn) {
            transposeSparseToDense(in, out, 0, in.rlen, 0, in.clen);
        } else if (sparseOut) {
            transposeDenseToSparse(in, out);
        } else {
            transposeDenseToDense(in, out, 0, in.rlen, 0, in.clen);
        }
        return out;
    }

    /**
     * Multi-threaded transpose into a new output block, with CSR output disallowed.
     *
     * @param in input block
     * @param k  degree of parallelism
     * @return the transposed block
     */
    public static MatrixBlock transpose(MatrixBlock in, int k) {
        final boolean allowCSR = false;
        return transpose(in, k, allowCSR);
    }

    /**
     * Multi-threaded transpose into a freshly allocated output block. The output format is
     * decided by {@code MatrixBlock.evalSparseFormatInMemory} for the transposed dimensions.
     *
     * @param in       input block
     * @param k        degree of parallelism
     * @param allowCSR passed to the format decision and to the transpose implementation
     * @return the transposed block
     */
    public static MatrixBlock transpose(MatrixBlock in, int k, boolean allowCSR) {
        final int outRows = in.getNumColumns();
        final int outCols = in.getNumRows();
        final long nnz = in.getNonZeros();
        final boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(outRows, outCols, nnz, allowCSR);
        final MatrixBlock out = new MatrixBlock(outRows, outCols, sparseOut);
        return transpose(in, out, k, allowCSR);
    }

    /**
     * Multi-threaded transpose into the given output block, with CSR output disallowed.
     *
     * @param in  input block
     * @param out output block
     * @param k   degree of parallelism
     * @return {@code out}
     */
    public static MatrixBlock transpose(MatrixBlock in, MatrixBlock out, int k) {
        final boolean allowCSR = false;
        return transpose(in, out, k, allowCSR);
    }

    /**
     * Multi-threaded transpose of {@code in} into {@code out}.
     *
     * Falls back to the single-threaded {@code transpose(in, out)} for small, empty, vector-like
     * or ultra-sparse inputs (see the guard below). Otherwise the work is split into at most
     * {@code k} tasks over either the input rows or the input columns and run on a thread pool
     * that is always shut down before returning.
     *
     * @param in       input block
     * @param out      output block; its sparse flag selects the output format
     * @param k        degree of parallelism (values &lt;= 1 use the single-threaded path)
     * @param allowCSR whether a sparse output may be built directly as a CSR block; further
     *                 restricted inside to inputs with at most 4096 columns or fewer than
     *                 10,000,000 non-zeros
     * @return {@code out}
     * @throws DMLRuntimeException wrapping any exception raised during parallel execution
     */
    public static MatrixBlock transpose(MatrixBlock in, MatrixBlock out, int k, boolean allowCSR) {
        // Single-threaded fallback when: input is empty, cell count is below the parallel
        // threshold, k <= 1, dense-to-dense vector, single-row input with a format change,
        // or sparse-to-sparse with fewer non-zeros than max(rows, cols).
        if (in.isEmptyBlock(false) || (long)in.rlen * (long)in.clen < PAR_NUMCELL_THRESHOLD || k <= 1 || !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1) || in.sparse && !out.sparse && in.rlen == 1 || !in.sparse && out.sparse && in.rlen == 1 || in.sparse && out.sparse && in.nonZeros < (long)Math.max(in.rlen, in.clen)) {
            return LibMatrixReorg.transpose(in, out);
        }
        out.nonZeros = in.nonZeros;
        // Dense-to-sparse has its own parallel implementation.
        if (!in.sparse && out.sparse) {
            LibMatrixReorg.transposeDenseToSparse(in, out, k);
            return out;
        }
        allowCSR = allowCSR && (in.clen <= 4096 || out.nonZeros < 10000000L);
        // Per-column counts, later turned into write offsets; stays null outside the CSR path.
        int[] cnt = null;
        ExecutorService pool = CommonThreadPool.get(k);
        try {
            if (out.sparse && allowCSR) {
                // CSR output: count non-zeros per input column concurrently while allocating,
                // then turn the counts into row pointers via a prefix sum.
                int size = (int)out.nonZeros;
                Future<int[]> f = LibMatrixReorg.countNNZColumns(in, k, pool);
                out.sparseBlock = new SparseBlockCSR(in.getNumColumns(), size, size);
                int[] outPtr = ((SparseBlockCSR)out.sparseBlock).rowPointers();
                cnt = f.get();
                for (int i = 0; i < cnt.length; ++i) {
                    outPtr[i + 1] = outPtr[i] + cnt[i];
                    // cnt[i] now holds the start offset of output row i instead of its count
                    cnt[i] = outPtr[i];
                }
            } else if (out.sparse) {
                out.allocateSparseRowsBlock(false);
            } else {
                out.allocateDenseBlock(false);
            }
            ArrayList<TransposeTask> tasks = new ArrayList<TransposeTask>();
            // Tasks return their own result blocks (combined afterwards) only for sparse-to-sparse
            // inputs with at least as many rows as columns, and only outside the CSR path.
            boolean allowReturnBlock = out.sparse && in.sparse && in.rlen >= in.clen && cnt == null;
            // row == true: the partitioned length is in.rlen; otherwise it is in.clen.
            boolean row = !(!in.sparse && in.rlen < in.clen || out.sparse && !allowReturnBlock);
            int len = row ? in.rlen : in.clen;
            int blklen = (int)Math.ceil((double)len / (double)k);
            // Sparse input: at least 32 per task; dense output: round up to a multiple of 8.
            blklen = in.sparse ? Math.max(blklen, 32) : (blklen += !out.sparse && blklen % 8 != 0 ? 8 - blklen % 8 : 0);
            int i = 0;
            while (i < k & i * blklen < len) {
                tasks.add(new TransposeTask(in, out, row, i * blklen, Math.min((i + 1) * blklen, len), cnt, allowReturnBlock));
                ++i;
            }
            ArrayList<MatrixBlock> blocks = allowReturnBlock ? new ArrayList<MatrixBlock>() : null;
            // get() both waits for completion and rethrows task failures.
            for (Future task : pool.invokeAll(tasks)) {
                MatrixBlock m = (MatrixBlock)task.get();
                if (!allowReturnBlock || m == null) continue;
                blocks.add(m);
            }
            if (allowReturnBlock) {
                LibMatrixReorg.combine(blocks, out, row, k);
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            pool.shutdown();
        }
        return out;
    }

    /**
     * Merges the per-task result blocks into {@code out} by delegating to
     * {@code MatrixBlock.append(blocks, out, row, k)}.
     *
     * @param blocks partial result blocks
     * @param out    target block
     * @param row    forwarded unchanged to {@code MatrixBlock.append}
     * @param k      degree of parallelism forwarded to {@code MatrixBlock.append}
     */
    private static void combine(List<MatrixBlock> blocks, MatrixBlock out, boolean row, int k) {
        MatrixBlock.append(blocks, out, row, k);
    }

    /**
     * Starts the per-row-range column counting tasks and submits one more task to
     * {@code pool} that merges their partial count arrays via {@code mergeNnzCounts}.
     *
     * @param in   input block
     * @param k    number of row partitions
     * @param pool executor on which both the counting tasks and the merge task run
     * @return future of the merged counts ({@code null} if there were no partial results)
     * @throws InterruptedException if interrupted while the counting tasks are invoked
     * @throws ExecutionException declared by the signature
     */
    public static Future<int[]> countNNZColumns(MatrixBlock in, int k, ExecutorService pool) throws InterruptedException, ExecutionException {
        final List<Future<int[]>> partials = countNNZColumnsFuture(in, k, pool);
        final Callable<int[]> mergeAll = () -> {
            int[] merged = null;
            for (Future<int[]> partial : partials) {
                merged = mergeNnzCounts(merged, partial.get());
            }
            return merged;
        };
        return pool.submit(mergeAll);
    }

    /**
     * Splits the input rows into at most {@code k} contiguous ranges of
     * {@code ceil(rlen / k)} rows and invokes one {@code CountNnzTask} per range on the pool.
     *
     * @param in   input block
     * @param k    maximum number of row partitions
     * @param pool executor used to run the tasks
     * @return the futures returned by {@code pool.invokeAll}, one per task
     * @throws InterruptedException if interrupted while waiting in {@code invokeAll}
     */
    public static List<Future<int[]>> countNNZColumnsFuture(MatrixBlock in, int k, ExecutorService pool) throws InterruptedException {
        final List<CountNnzTask> tasks = new ArrayList<>();
        final int blklen = (int) Math.ceil((double) in.rlen / (double) k);
        for (int i = 0; i < k && i * blklen < in.rlen; i++) {
            final int lower = i * blklen;
            final int upper = Math.min(lower + blklen, in.rlen);
            tasks.add(new CountNnzTask(in, lower, upper));
        }
        return pool.invokeAll(tasks);
    }

    /**
     * Transposes a block, reusing the input only for dense data. Empty inputs yield a new
     * empty sparse block with swapped dimensions, sparse inputs are transposed into a new
     * sparse block (CSR allowed), and dense inputs are transposed in place.
     *
     * @param in block to transpose
     * @param k  degree of parallelism
     * @return a new block for empty or sparse inputs, otherwise {@code in} itself
     */
    public static MatrixBlock transposeInPlace(MatrixBlock in, int k) {
        if (in.isEmpty()) {
            return new MatrixBlock(in.getNumColumns(), in.getNumRows(), true);
        }
        if (in.isInSparseFormat()) {
            final MatrixBlock target = new MatrixBlock(in.getNumColumns(), in.getNumRows(), true);
            return transpose(in, target, k, true);
        }
        transposeInPlaceDense(in, k);
        return in;
    }

    /**
     * Reverse operation on a single block. Empty inputs leave {@code out} untouched and
     * single-row inputs are simply copied.
     *
     * @param in  input block
     * @param out output block
     * @return {@code out}
     */
    public static MatrixBlock rev(MatrixBlock in, MatrixBlock out) {
        final boolean empty = in.isEmptyBlock(false);
        if (!empty) {
            if (in.rlen == 1) {
                out.copy(in);
            } else if (in.sparse) {
                reverseSparse(in, out);
            } else {
                reverseDense(in, out);
            }
        }
        return out;
    }

    /**
     * Reverse operation for one block of a blocked matrix: reverses the block locally and
     * appends the resulting output block(s), with their target indexes, to {@code out}.
     *
     * @param in   indexed input block
     * @param rlen total number of rows of the overall matrix
     * @param blen block length used for the index computations
     * @param out  list receiving one or two indexed output blocks
     */
    public static void rev(IndexedMatrixValue in, long rlen, int blen, ArrayList<IndexedMatrixValue> out) {
        MatrixIndexes inix = in.getIndexes();
        MatrixBlock inblk = (MatrixBlock)in.getValue();
        // reverse the block locally into a new block of the same shape and format
        MatrixBlock tmpblk = LibMatrixReorg.rev(inblk, new MatrixBlock(inblk.getNumRows(), inblk.getNumColumns(), inblk.isInSparseFormat()));
        if (rlen % (long)blen == 0L) {
            // rlen is a multiple of blen: the reversed block maps onto exactly one output
            // block, with row-block index nrblks - rowIndex + 1
            int nrblks = (int)Math.ceil((double)rlen / (double)blen);
            out.add(new IndexedMatrixValue(new MatrixIndexes((long)nrblks - inix.getRowIndex() + 1L, inix.getColumnIndex()), tmpblk));
        } else {
            // otherwise the reversed rows may span two output blocks
            // NOTE(review): pos1/pos2 look like the 1-based first/last global target rows;
            // this depends on the UtilFunctions helpers, which are not visible here -- confirm
            long pos1 = rlen - UtilFunctions.computeCellIndex(inix.getRowIndex(), blen, tmpblk.getNumRows() - 1) + 1L;
            long pos2 = pos1 + (long)tmpblk.getNumRows() - 1L;
            int ipos1 = UtilFunctions.computeCellInBlock(pos1, blen);
            // last row of tmpblk that is written to the first output block
            int iposCut = tmpblk.getNumRows() - ipos1 - 1;
            int blkix1 = (int)UtilFunctions.computeBlockIndex(pos1, blen);
            int blkix2 = (int)UtilFunctions.computeBlockIndex(pos2, blen);
            int blklen1 = UtilFunctions.computeBlockSize(rlen, blkix1, blen);
            int blklen2 = UtilFunctions.computeBlockSize(rlen, blkix2, blen);
            // first output block: rows 0..iposCut of tmpblk, written starting at row ipos1
            MatrixIndexes outix1 = new MatrixIndexes(blkix1, inix.getColumnIndex());
            MatrixBlock outblk1 = new MatrixBlock(blklen1, inblk.getNumColumns(), inblk.isInSparseFormat());
            MatrixBlock tmp1 = tmpblk.slice(0, iposCut);
            outblk1.leftIndexingOperations(tmp1, ipos1, ipos1 + tmp1.getNumRows() - 1, 0, tmpblk.getNumColumns() - 1, outblk1, MatrixObject.UpdateType.INPLACE_PINNED);
            out.add(new IndexedMatrixValue(outix1, outblk1));
            if (blkix1 != blkix2) {
                // second output block: the remaining rows of tmpblk, written starting at row 0
                MatrixIndexes outix2 = new MatrixIndexes(blkix2, inix.getColumnIndex());
                MatrixBlock outblk2 = new MatrixBlock(blklen2, inblk.getNumColumns(), inblk.isInSparseFormat());
                MatrixBlock tmp2 = tmpblk.slice(iposCut + 1, tmpblk.getNumRows() - 1);
                outblk2.leftIndexingOperations(tmp2, 0, tmp2.getNumRows() - 1, 0, tmpblk.getNumColumns() - 1, outblk2, MatrixObject.UpdateType.INPLACE_PINNED);
                out.add(new IndexedMatrixValue(outix2, outblk2));
            }
        }
    }

    /**
     * Diag operation: a single-column input goes through {@code diagV2M}, a square input
     * through {@code diagM2V}. Empty inputs leave {@code out} untouched.
     *
     * @param in  input block (single column or square)
     * @param out output block
     * @return {@code out}
     * @throws DMLRuntimeException if the input is neither a single column nor square
     */
    public static MatrixBlock diag(MatrixBlock in, MatrixBlock out) {
        if (in.isEmptyBlock(false)) {
            return out;
        }
        final int rlen = in.rlen;
        final int clen = in.clen;
        if (clen == 1) {
            diagV2M(in, out);
            return out;
        }
        if (rlen != clen) {
            throw new DMLRuntimeException("Reorg diagM2V requires squared block input. (" + rlen + ", " + clen + ")");
        }
        diagM2V(in, out);
        return out;
    }

    /**
     * Single-threaded sort; delegates to the multi-threaded variant with one thread.
     *
     * @param in    input block
     * @param out   output block
     * @param by    order-by column indexes
     * @param desc  whether to sort in decreasing order
     * @param ixret whether to return the index vector instead of the sorted data
     * @return {@code out}
     */
    public static MatrixBlock sort(MatrixBlock in, MatrixBlock out, int[] by, boolean desc, boolean ixret) {
        final int numThreads = 1;
        return sort(in, out, by, desc, ixret, numThreads);
    }

    /**
     * Sorts the rows of {@code in} by the given order-by columns and writes either the sorted
     * data or the resulting row index vector into {@code out}.
     *
     * @param in    input block
     * @param out   output block; its sparse flag and non-zero count are set here
     * @param by    order-by column indexes; {@code by[0] - 1} is used as the 0-based column of
     *              the primary sort key
     * @param desc  whether to sort in decreasing order
     * @param ixret if true, the output is a dense column holding 1-based row indexes
     * @param k     degree of parallelism
     * @return {@code out}
     * @throws DMLRuntimeException on an invalid order-by list or a failure in parallel sorting
     */
    public static MatrixBlock sort(MatrixBlock in, MatrixBlock out, int[] by, boolean desc, boolean ixret, int k) {
        int i;
        ArrayList<Callable<Object>> tasks;
        boolean sparse = in.isInSparseFormat();
        int rlen = in.rlen;
        int clen = in.clen;
        // index output is always dense and has one non-zero per row
        out.sparse = in.sparse && !ixret;
        // NOTE(review): local 'l' is never read; it looks like a decompiler artifact
        long l = out.nonZeros = ixret ? (long)rlen : in.nonZeros;
        if (!LibMatrixReorg.isValidSortByList(by, clen)) {
            throw new DMLRuntimeException("Sort configuration issue: invalid orderby columns: " + Arrays.toString(by) + " (" + rlen + "x" + clen + " input).");
        }
        if (!ixret) {
            // data output, empty input: nothing to sort
            if (in.isEmptyBlock(false)) {
                return out;
            }
            // data output, dense single column: sort the values directly
            if (!sparse && clen == 1) {
                out.copy(in);
                if (k > 1) {
                    Arrays.parallelSort(out.getDenseBlockValues());
                } else {
                    Arrays.sort(out.getDenseBlockValues());
                }
                if (desc) {
                    LibMatrixReorg.sortReverseDense(out);
                }
                return out;
            }
        } else if (in.isEmptyBlock(false)) {
            // index output, empty input: return the identity index vector 1..rlen
            out.allocateDenseBlock(false);
            double[] c = out.getDenseBlockValues();
            for (int i2 = 0; i2 < rlen; ++i2) {
                c[i2] = i2 + 1;
            }
            return out;
        }
        // general case: vix starts as the identity permutation, values holds the primary key
        int[] vix = new int[rlen];
        double[] values = new double[rlen];
        for (int i3 = 0; i3 < rlen; ++i3) {
            vix[i3] = i3;
            values[i3] = in.quickGetValue(i3, by[0] - 1);
        }
        // single-threaded sort for k == 1 or fewer than 1024 rows
        if (k == 1 || rlen < 1024) {
            SortUtils.sortByValue(0, rlen, values, vix);
        } else {
            // parallel: one SortTask per range of ceil(rlen / k) rows, then merge the ranges
            try {
                ExecutorService pool = CommonThreadPool.get(k);
                tasks = new ArrayList<Callable<Object>>();
                int blklen = (int)Math.ceil((double)rlen / (double)k);
                i = 0;
                while (i * blklen < rlen) {
                    int start = i * blklen;
                    int stop = Math.min(rlen, i * blklen + blklen);
                    tasks.add(new SortTask(start, stop, vix, values));
                    ++i;
                }
                CommonThreadPool.invokeAndShutdown(pool, tasks);
                LibMatrixReorg.mergeSortedBlocks(blklen, vix, values, k);
            }
            catch (Exception ex) {
                throw new DMLRuntimeException(ex);
            }
        }
        // more than one order-by column: hand over to sortBySecondary, starting at position 1
        if (by.length > 1) {
            LibMatrixReorg.sortBySecondary(0, rlen, values, vix, in, by, 1);
        }
        // descending order is obtained by reversing both the index and the value array
        if (desc) {
            LibMatrixReorg.sortReverseDense(vix);
            LibMatrixReorg.sortReverseDense(values);
        }
        // NOTE(review): presumably restores a stable order among rows with equal keys;
        // sortIndexesStable is not visible here -- confirm
        LibMatrixReorg.sortIndexesStable(0, rlen, values, vix, in, by, 1);
        if (!ixret) {
            // data output: one CopyTask per balanced row range, each given the permutation vix
            out.allocateBlock();
            ExecutorService pool = CommonThreadPool.get(k);
            tasks = new ArrayList();
            ArrayList<Integer> blklen = UtilFunctions.getBalancedBlockSizesDefault(rlen, k, false);
            int lb = 0;
            for (i = 0; i < blklen.size(); ++i) {
                tasks.add(new CopyTask(in, out, vix, lb, lb + blklen.get(i)));
                lb += blklen.get(i).intValue();
            }
            CommonThreadPool.invokeAndShutdown(pool, tasks);
        } else {
            // index output: write the permutation as 1-based row indexes
            out.allocateDenseBlock(false);
            DenseBlock c = out.getDenseBlock();
            for (int i4 = 0; i4 < rlen; ++i4) {
                c.set(i4, 0, vix[i4] + 1);
            }
        }
        return out;
    }

    /**
     * Reshapes a block to {@code rows x cols}. If the dimensions already match, {@code out}
     * becomes a shallow copy of {@code in}; otherwise the output metadata is set and one of
     * four format-specific kernels is invoked.
     *
     * @param in      input block
     * @param out     output block (dimensions, sparse flag and non-zeros are overwritten)
     * @param rows    number of output rows
     * @param cols    number of output columns
     * @param rowwise forwarded to the reshape kernels
     * @return {@code out}
     * @throws DMLRuntimeException if input and output cell counts differ
     */
    public static MatrixBlock reshape(MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise) {
        final int rlen = in.rlen;
        final int clen = in.clen;
        final long inCells = (long) rlen * (long) clen;
        final long outCells = (long) rows * (long) cols;
        if (inCells != outCells) {
            throw new DMLRuntimeException("Reshape matrix requires consistent numbers of input/output cells (" + rlen + ":" + clen + ", " + rows + ":" + cols + ").");
        }
        if (rlen == rows && clen == cols) {
            out.copyShallow(in);
            return out;
        }

        // set output metadata before dispatching on the format combination
        out.sparse = MatrixBlock.evalSparseFormatInMemory(rows, cols, in.nonZeros);
        out.rlen = rows;
        out.clen = cols;
        out.nonZeros = in.nonZeros;

        if (in.sparse) {
            if (out.sparse) {
                reshapeSparse(in, out, rows, cols, rowwise);
            } else {
                reshapeSparseToDense(in, out, rows, cols, rowwise);
            }
        } else {
            if (out.sparse) {
                reshapeDenseToSparse(in, out, rows, cols, rowwise);
            } else {
                reshapeDense(in, out, rows, cols, rowwise);
            }
        }
        return out;
    }

    /**
     * Reshape for one block of a blocked matrix: determines the affected result block
     * indexes, creates the result blocks, fills them from the input block and returns them
     * as indexed values after a sparsity check on each.
     *
     * @param in                indexed input block
     * @param mcIn              data characteristics of the input matrix
     * @param mcOut             data characteristics of the output matrix
     * @param rowwise           forwarded to the reshape kernels
     * @param outputEmptyBlocks if false, empty result blocks are dropped from the result
     * @return list of indexed result blocks
     */
    public static List<IndexedMatrixValue> reshape(IndexedMatrixValue in, DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise, boolean outputEmptyBlocks) {
        final MatrixIndexes ixIn = in.getIndexes();
        final MatrixBlock mbIn = (MatrixBlock) in.getValue();

        final Collection<MatrixIndexes> resultIndexes = computeAllResultBlockIndexes(ixIn, mcIn, mcOut, mbIn, rowwise, outputEmptyBlocks);
        final Map<MatrixIndexes, MatrixBlock> resultBlocks = createAllResultBlocks(resultIndexes, mbIn.nonZeros, mcOut);

        // 0-based global offsets of the input block
        final long rowOffset = (ixIn.getRowIndex() - 1L) * (long) mcIn.getBlocksize();
        final long colOffset = (ixIn.getColumnIndex() - 1L) * (long) mcIn.getBlocksize();
        if (mbIn.sparse) {
            reshapeSparse(mbIn, rowOffset, colOffset, resultBlocks, mcIn, mcOut, rowwise);
        } else {
            reshapeDense(mbIn, rowOffset, colOffset, resultBlocks, mcIn, mcOut, rowwise);
        }

        return resultBlocks.entrySet().stream()
            .filter(entry -> outputEmptyBlocks || !entry.getValue().isEmptyBlock(false))
            .map(entry -> {
                final MatrixBlock block = entry.getValue();
                block.examSparsity();
                return new IndexedMatrixValue(entry.getKey(), (MatrixValue) block);
            })
            .collect(Collectors.toList());
    }

    /**
     * Removes empty rows or columns from a block.
     *
     * @param in          input block
     * @param ret         result block
     * @param rows        true to remove empty rows, false to remove empty columns
     * @param emptyReturn for an empty input without select vector: whether the result keeps
     *                    one row/column (true) or zero (false); also forwarded to the helpers
     * @param select      optional selection vector, may be null
     * @return {@code ret}, or {@code in} itself when the select vector has as many non-zeros
     *         as the input has rows (columns), or whatever the removal helper returns
     */
    public static MatrixBlock rmempty(MatrixBlock in, MatrixBlock ret, boolean rows, boolean emptyReturn, MatrixBlock select) {
        if (in.isEmptyBlock(false) && select == null) {
            final int n = emptyReturn ? 1 : 0;
            ret.reset(rows ? n : in.rlen, rows ? in.clen : n, in.sparse);
            return ret;
        }
        final long fullLength = rows ? in.rlen : in.clen;
        if (select != null && select.nonZeros == fullLength) {
            return in;
        }
        return rows
            ? removeEmptyRows(in, ret, select, emptyReturn)
            : removeEmptyColumns(in, ret, select, emptyReturn);
    }

    /**
     * Remove-empty for one block of a blocked matrix. Every row (or column) of {@code data}
     * whose entry in {@code offset} is a valid 1-based target position in {@code [1, len]}
     * is copied to that position in the output; rows/columns with any other offset are
     * dropped. Output blocks are created on demand and appended to {@code outList} after
     * their non-zero counts are recomputed.
     *
     * @param data    indexed input data block
     * @param offset  indexed offset block (a column for {@code rmRows}, otherwise a row)
     * @param rmRows  true to redistribute rows, false to redistribute columns
     * @param len     length of the output along the redistributed dimension
     * @param blen    block length used to derive output block indexes
     * @param outList list receiving the produced indexed output blocks
     * @throws DMLRuntimeException if either value is not a {@code MatrixBlock} or the
     *         dimensions of data and offsets do not match
     */
    public static void rmempty(IndexedMatrixValue data, IndexedMatrixValue offset, boolean rmRows, long len, long blen, ArrayList<IndexedMatrixValue> outList) {
        IndexedMatrixValue tmpIMV;
        MatrixBlock src;
        int i;
        long clen;
        long rlen;
        if (!(data.getValue() instanceof MatrixBlock) || !(offset.getValue() instanceof MatrixBlock)) {
            throw new DMLRuntimeException("Unsupported input data: expected " + MatrixBlock.class.getName() + " but got " + data.getValue().getClass().getName() + " and " + offset.getValue().getClass().getName());
        }
        if (rmRows && data.getValue().getNumRows() != offset.getValue().getNumRows() || !rmRows && data.getValue().getNumColumns() != offset.getValue().getNumColumns()) {
            // fix: the message previously opened a '[' without ever closing it
            throw new DMLRuntimeException("Dimension mismatch between input data and offsets: [" + data.getValue().getNumRows() + "x" + data.getValue().getNumColumns() + " vs " + offset.getValue().getNumRows() + "x" + offset.getValue().getNumColumns() + "]");
        }
        HashMap<MatrixIndexes, IndexedMatrixValue> out = new HashMap<MatrixIndexes, IndexedMatrixValue>();
        MatrixBlock linData = (MatrixBlock)data.getValue();
        MatrixBlock linOffset = (MatrixBlock)offset.getValue();
        // reusable lookup key; a fresh MatrixIndexes copy is stored whenever a block is created
        MatrixIndexes tmpIx = new MatrixIndexes(-1L, -1L);
        if (rmRows) {
            rlen = len;
            clen = linData.getNumColumns();
            for (i = 0; i < linOffset.getNumRows(); ++i) {
                // 1-based target row; anything outside [1, rlen] is skipped
                long rix = (long)linOffset.quickGetValue(i, 0);
                if (rix <= 0L || rix > rlen) continue;
                src = linData.slice(i, i, 0, (int)(clen - 1L), new MatrixBlock());
                // 1-based output block index and 0-based row within that block
                long brix = (rix - 1L) / blen + 1L;
                long lbrix = (rix - 1L) % blen;
                tmpIx.setIndexes(brix, data.getIndexes().getColumnIndex());
                if (!out.containsKey(tmpIx)) {
                    tmpIMV = new IndexedMatrixValue(new MatrixIndexes(), new MatrixBlock());
                    tmpIMV.getIndexes().setIndexes(tmpIx);
                    // the last block along the dimension may be shorter than blen
                    ((MatrixBlock)tmpIMV.getValue()).reset((int)Math.min(blen, rlen - (brix - 1L) * blen), (int)clen);
                    out.put(tmpIMV.getIndexes(), tmpIMV);
                }
                ((MatrixBlock)out.get(tmpIx).getValue()).copy((int)lbrix, (int)lbrix, 0, (int)clen - 1, src, false);
            }
        } else {
            rlen = linData.getNumRows();
            clen = len;
            for (i = 0; i < linOffset.getNumColumns(); ++i) {
                // 1-based target column; anything outside [1, clen] is skipped
                long cix = (long)linOffset.quickGetValue(0, i);
                if (cix <= 0L || cix > clen) continue;
                src = linData.slice(0, (int)(rlen - 1L), i, i, new MatrixBlock());
                // 1-based output block index and 0-based column within that block
                long bcix = (cix - 1L) / blen + 1L;
                long lbcix = (cix - 1L) % blen;
                tmpIx.setIndexes(data.getIndexes().getRowIndex(), bcix);
                if (!out.containsKey(tmpIx)) {
                    tmpIMV = new IndexedMatrixValue(new MatrixIndexes(), new MatrixBlock());
                    tmpIMV.getIndexes().setIndexes(tmpIx);
                    // the last block along the dimension may be narrower than blen
                    ((MatrixBlock)tmpIMV.getValue()).reset((int)rlen, (int)Math.min(blen, clen - (bcix - 1L) * blen));
                    out.put(tmpIMV.getIndexes(), tmpIMV);
                }
                ((MatrixBlock)out.get(tmpIx).getValue()).copy(0, (int)rlen - 1, (int)lbcix, (int)lbcix, src, false);
            }
        }
        // the copies above were issued with a 'false' last argument, so counts are recomputed here
        for (IndexedMatrixValue imv : out.values()) {
            ((MatrixBlock)imv.getValue()).recomputeNonZeros();
            outList.add(imv);
        }
    }

    /**
     * Convenience overload taking {@code max} as a double; it is converted with
     * {@code UtilFunctions.toInt} and forwarded to the int-based variant.
     *
     * @param in     input block
     * @param ret    result block
     * @param max    maximum value as double
     * @param rows   true to expand into rows, false to expand into columns
     * @param cast   forwarded to the expansion helper
     * @param ignore forwarded to the input check and the expansion helper
     * @param k      degree of parallelism
     * @return the expanded block
     */
    public static MatrixBlock rexpand(MatrixBlock in, MatrixBlock ret, double max, boolean rows, boolean cast, boolean ignore, int k) {
        final int maxInt = UtilFunctions.toInt(max);
        return rexpand(in, ret, maxInt, rows, cast, ignore, k);
    }

    /**
     * Expands a block into rows or columns after validating the input. An empty input
     * yields an empty sparse result of size {@code max x rlen} (rows) or {@code rlen x max}.
     *
     * @param in     input block
     * @param ret    result block
     * @param max    size of the expanded dimension
     * @param rows   true to expand into rows, false to expand into columns
     * @param cast   forwarded to the expansion helper
     * @param ignore forwarded to the input check and the expansion helper
     * @param k      degree of parallelism (used for column expansion only)
     * @return the expanded block
     */
    public static MatrixBlock rexpand(MatrixBlock in, MatrixBlock ret, int max, boolean rows, boolean cast, boolean ignore, int k) {
        checkRexpand(in, ignore);
        if (!in.isEmptyBlock(false)) {
            return rows
                ? rexpandRows(in, ret, max, cast, ignore)
                : rexpandColumns(in, ret, max, cast, ignore, k);
        }
        final int outRows = rows ? max : in.rlen;
        final int outCols = rows ? in.rlen : max;
        ret.reset(outRows, outCols, true);
        return ret;
    }

    /**
     * Validates the rexpand input: unless {@code ignore} is set, the input must have at
     * least as many non-zeros as it has rows.
     *
     * @param in     input block
     * @param ignore if true, no check is performed
     * @throws DMLRuntimeException if the check fails
     */
    public static void checkRexpand(MatrixBlock in, boolean ignore) {
        if (ignore) {
            return;
        }
        if (in.getNonZeros() < (long) in.getNumRows()) {
            throw new DMLRuntimeException("Invalid input w/ zeros for rexpand ignore=false (rlen=" + in.getNumRows() + ", nnz=" + in.getNonZeros() + ").");
        }
    }

    /**
     * Rexpand for one block of a blocked matrix: expands the block single-threaded and
     * slices the result into pieces of at most {@code blen} rows (or columns), which are
     * appended to {@code outList} with their block indexes.
     *
     * @param data    indexed input block
     * @param max     maximum value
     * @param rows    true to expand into rows, false to expand into columns
     * @param cast    forwarded to the expansion
     * @param ignore  forwarded to the expansion
     * @param blen    block length of the output
     * @param outList list receiving the indexed output blocks
     */
    public static void rexpand(IndexedMatrixValue data, double max, boolean rows, boolean cast, boolean ignore, long blen, ArrayList<IndexedMatrixValue> outList) {
        final MatrixIndexes ix = data.getIndexes();
        final MatrixBlock in = (MatrixBlock) data.getValue();
        final MatrixBlock tmp = rexpand(in, new MatrixBlock(), max, rows, cast, ignore, 1);
        if (rows) {
            // slice row-wise; the input row-block index becomes the output column-block index
            for (int rl = 0; rl < tmp.getNumRows(); rl += blen) {
                final int ru = (int) (Math.min((long) rl + blen, (long) tmp.getNumRows()) - 1L);
                final MatrixBlock mb = tmp.slice(rl, ru);
                outList.add(new IndexedMatrixValue(new MatrixIndexes((long) rl / blen + 1L, ix.getRowIndex()), mb));
            }
        } else {
            // slice column-wise; the input row-block index is kept
            for (int cl = 0; cl < tmp.getNumColumns(); cl += blen) {
                final int cu = (int) (Math.min((long) cl + blen, (long) tmp.getNumColumns()) - 1L);
                final MatrixBlock mb = tmp.slice(0, tmp.getNumRows() - 1, cl, cu, new MatrixBlock());
                outList.add(new IndexedMatrixValue(new MatrixIndexes(ix.getRowIndex(), (long) cl / blen + 1L), mb));
            }
        }
    }

    /**
     * Classifies a reorg operator by the runtime type of its function object.
     *
     * @param op reorg operator
     * @return TRANSPOSE, REV, DIAG or SORT for the matching function object, otherwise INVALID
     */
    private static ReorgType getReorgType(ReorgOperator op) {
        return op.fn instanceof SwapIndex ? ReorgType.TRANSPOSE
            : op.fn instanceof RevIndex ? ReorgType.REV
            : op.fn instanceof DiagIndex ? ReorgType.DIAG
            : op.fn instanceof SortIndex ? ReorgType.SORT
            : ReorgType.INVALID;
    }

    /**
     * Dense-to-dense transpose of the input range rows {@code [rl, ru)} and columns
     * {@code [cl, cu)}. Vectors are copied with a single array copy; matrices are processed
     * in 128 x 128 tiles.
     *
     * @param in  dense input block
     * @param out dense, allocated output block
     * @param rl  first input row (inclusive)
     * @param ru  last input row (exclusive)
     * @param cl  first input column (inclusive)
     * @param cu  last input column (exclusive)
     */
    private static void transposeDenseToDense(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu) {
        final int m = in.rlen;
        final int n = in.clen;
        final int n2 = out.clen;
        final DenseBlock a = in.getDenseBlock();
        final DenseBlock c = out.getDenseBlock();

        // vector: the value layout is identical, so copy the addressed range directly
        if (m == 1 || n == 1) {
            final int offset = rl + cl;
            final int len = ru + cu - offset - 1;
            System.arraycopy(a.valuesAt(0), offset, c.valuesAt(0), offset, len);
            return;
        }

        final int blocksizeI = 128;
        final int blocksizeJ = 128;
        final boolean singleArrays = a.numBlocks() == 1 && c.numBlocks() == 1;
        if (singleArrays) {
            // both sides are backed by one array: use linear indexes and the row kernel
            final double[] avals = a.valuesAt(0);
            final double[] cvals = c.valuesAt(0);
            for (int bi = rl; bi < ru; bi += blocksizeI) {
                final int bimin = Math.min(bi + blocksizeI, ru);
                for (int bj = cl; bj < cu; bj += blocksizeJ) {
                    final int bjmin = Math.min(bj + blocksizeJ, cu);
                    for (int i = bi; i < bimin; i++) {
                        transposeRow(avals, cvals, i * n + bj, bj * n2 + i, n2, bjmin - bj);
                    }
                }
            }
        } else {
            // general case: address cells through the dense block accessors
            for (int bi = rl; bi < ru; bi += blocksizeI) {
                final int bimin = Math.min(bi + blocksizeI, ru);
                for (int bj = cl; bj < cu; bj += blocksizeJ) {
                    final int bjmin = Math.min(bj + blocksizeJ, cu);
                    for (int i = bi; i < bimin; i++) {
                        final double[] avals = a.values(i);
                        final int aix = a.pos(i);
                        for (int j = bj; j < bjmin; j++) {
                            c.set(j, i, avals[aix + j]);
                        }
                    }
                }
            }
        }
    }

    /**
     * Single-threaded dense-to-sparse transpose; delegates with one thread.
     *
     * @param in  dense input block
     * @param out sparse output block
     */
    private static void transposeDenseToSparse(MatrixBlock in, MatrixBlock out) {
        final int numThreads = 1;
        transposeDenseToSparse(in, out, numThreads);
    }

    /**
     * Dense-to-sparse transpose: a single-row output takes the vector path, everything else
     * the (optionally multi-threaded) matrix path.
     *
     * @param in  dense input block
     * @param out sparse output block
     * @param k   degree of parallelism for the matrix path
     */
    private static void transposeDenseToSparse(MatrixBlock in, MatrixBlock out, int k) {
        final boolean singleRowOutput = out.rlen == 1;
        if (singleRowOutput) {
            transposeDenseToSparseVV(in, out);
            return;
        }
        transposeDenseToSparseMM(in, out, k);
    }

    /**
     * Dense-to-sparse transpose for a single-row output: builds one sparse row from the
     * first value array of the dense input and stores it as row 0 of the output.
     *
     * @param in  dense input block
     * @param out sparse output block with a single row
     */
    private static void transposeDenseToSparseVV(MatrixBlock in, MatrixBlock out) {
        final int len = in.rlen;
        final DenseBlock src = in.getDenseBlock();
        out.allocateSparseRowsBlock(false);
        final SparseBlock target = out.getSparseBlock();
        final SparseRowVector row = new SparseRowVector((int) in.nonZeros, src.valuesAt(0), len);
        target.set(0, row, false);
    }

    /**
     * Dense-to-sparse transpose for matrices. Pre-allocates one sparse row per output row
     * (sized by the average non-zeros per output row), fills them either directly or via
     * column-range tasks on a thread pool, and installs them as an MCSR sparse block.
     *
     * @param in  dense input block
     * @param out output block that receives the new sparse block
     * @param k   degree of parallelism; values &lt;= 1 run on the calling thread
     * @throws DMLRuntimeException wrapping any exception raised by the parallel tasks
     */
    private static void transposeDenseToSparseMM(MatrixBlock in, MatrixBlock out, int k) {
        final int m = in.rlen;
        final int n = in.clen;
        final int m2 = out.rlen;
        final int n2 = out.clen;
        // estimated non-zeros per output row, used as the initial row capacity
        final int ennz2 = (int)(in.nonZeros / (long)m2);
        final DenseBlock a = in.getDenseBlock();
        final SparseRow[] rows = new SparseRowVector[m2];
        for (int j = 0; j < m2; ++j) {
            rows[j] = new SparseRowVector(ennz2, n2);
        }
        if (k <= 1) {
            LibMatrixReorg.transposeDenseToSparseMMRange(a, (SparseRowVector[])rows, 0, m, 0, n);
        } else {
            final ExecutorService pool = CommonThreadPool.get(k);
            try {
                final ArrayList<TransposeDenseToSparseTask> tasks = new ArrayList<TransposeDenseToSparseTask>();
                // each task is given all input rows and a disjoint range of input columns
                final int rbz = Math.max(1, m2 / k);
                for (int i = 0; i < m2; i += rbz) {
                    tasks.add(new TransposeDenseToSparseTask(a, (SparseRowVector[])rows, 0, m, i, Math.min(i + rbz, n)));
                }
                // get() both waits for completion and rethrows task failures
                for (Future<?> task : pool.invokeAll(tasks)) {
                    task.get();
                }
            }
            catch (Exception ex) {
                throw new DMLRuntimeException(ex);
            }
            finally {
                // always release the pool, also when a non-Exception Throwable propagates
                pool.shutdown();
            }
        }
        final SparseBlockMCSR c = new SparseBlockMCSR(rows, false);
        out.setSparseBlock(c);
    }

    /**
     * Appends the dense cells of rows {@code [rl, ru)} and columns {@code [cl, cu)}
     * to the sparse output rows, visiting the input in 128x128 tiles. Cell
     * {@code (i, j)} of the input is appended to {@code rows[j]} at index {@code i}.
     *
     * @param a    dense input
     * @param rows output rows, indexed by input column
     * @param rl   first input row (inclusive)
     * @param ru   last input row (exclusive)
     * @param cl   first input column (inclusive)
     * @param cu   last input column (exclusive)
     */
    private static void transposeDenseToSparseMMRange(DenseBlock a, SparseRowVector[] rows, int rl, int ru, int cl, int cu) {
        final int tile = 128;
        int rowStart = rl;
        while (rowStart < ru) {
            final int rowEnd = Math.min(rowStart + tile, ru);
            int colStart = cl;
            while (colStart < cu) {
                final int colEnd = Math.min(colStart + tile, cu);
                for (int r = rowStart; r < rowEnd; r++) {
                    final double[] src = a.values(r);
                    final int base = a.pos(r);
                    for (int col = colStart; col < colEnd; col++) {
                        rows[col].append(r, src[base + col]);
                    }
                }
                colStart += tile;
            }
            rowStart += tile;
        }
    }

    /**
     * Transposes by iterating over the input's sparse cells and appending each
     * one to the output sparse block with row and column index swapped. The
     * output non-zero count is copied from the input.
     *
     * @param in  sparse input block
     * @param out output block whose sparse block receives the cells
     */
    private static void transposeUltraSparse(MatrixBlock in, MatrixBlock out) {
        final Iterator<IJV> cells = in.getSparseBlockIterator();
        final SparseBlock target = out.getSparseBlock();
        for (; cells.hasNext(); ) {
            final IJV entry = cells.next();
            target.append(entry.getJ(), entry.getI(), entry.getV());
        }
        out.setNonZeros(in.getNonZeros());
    }

    /**
     * Entry point of the sparse-to-sparse transpose for a column range. Only the
     * full row range is supported; a one-column range uses the single-column
     * routine, wider ranges use the blocked routine.
     *
     * @param in  sparse input block
     * @param out sparse output block
     * @param rl  first input row (must be {@code <= 0})
     * @param ru  last input row, exclusive (must be {@code >= in.rlen})
     * @param cl  first input column (inclusive)
     * @param cu  last input column (exclusive)
     * @param cnt per-column non-zero counts forwarded to the worker routines
     * @throws RuntimeException if a partial row range is requested
     */
    private static void transposeSparseToSparse(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt) {
        final boolean coversAllRows = rl <= 0 && ru >= in.rlen;
        if (!coversAllRows) {
            throw new RuntimeException("Unsupported row-parallel transposeSparseToSparse: " + rl + ", " + ru);
        }
        final int width = cu - cl;
        if (width != 1) {
            LibMatrixReorg.transposeSparseToSparseBlock(in, out, rl, ru, cl, cu, cnt);
            return;
        }
        LibMatrixReorg.transposeSparseToSparseRow(in, out, rl, ru, cl, cnt);
    }

    /**
     * Transposes the single input column {@code cl} into output row {@code cl}:
     * every input row in {@code [rl, ru)} is scanned for an entry with column
     * index {@code cl}, which is appended to the output row at index {@code i}.
     *
     * @param in  sparse input block
     * @param out sparse output block
     * @param rl  first input row (inclusive)
     * @param ru  last input row (exclusive)
     * @param cl  input column to transpose
     * @param cnt per-column non-zero counts used to pre-allocate the output row;
     *            may be {@code null}, in which case no pre-allocation happens here
     */
    private static void transposeSparseToSparseRow(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int[] cnt) {
        SparseBlock a = in.getSparseBlock();
        SparseBlock c = out.getSparseBlock();
        // The blocked sibling accepts cnt == null, so tolerate it here as well instead of
        // failing with a NullPointerException.
        // NOTE(review): without counts this relies on append allocating the row on demand - confirm.
        if (cnt != null && cnt[cl] > 0) {
            c.allocate(cl, cnt[cl]);
        }
        for (int i = rl; i < ru; ++i) {
            if (a.isEmpty(i)) continue;
            int apos = a.pos(i);
            int alen = a.size(i);
            int[] aix = a.indexes(i);
            double[] avals = a.values(i);
            // stop scanning as soon as the column indexes exceed cl
            for (int j = apos; j < apos + alen && aix[j] <= cl; ++j) {
                if (aix[j] != cl) continue;
                c.append(cl, i, avals[j]);
            }
        }
    }

    /**
     * Blocked sparse-to-sparse transpose of input columns {@code [cl, cu)}.
     * Pre-allocates the affected output rows (exactly from {@code cnt} when given,
     * otherwise from an average estimate), derives block sizes from the inverse
     * sparsity of the input, and dispatches to the tall-skinny, MCSR or generic
     * worker.
     *
     * @param in  sparse input block
     * @param out sparse output block
     * @param rl  first input row (inclusive)
     * @param ru  last input row (exclusive)
     * @param cl  first input column (inclusive)
     * @param cu  last input column (exclusive)
     * @param cnt per-column non-zero counts, or {@code null} if unavailable
     */
    private static void transposeSparseToSparseBlock(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt) {
        int i;
        int m2 = out.rlen;
        int n2 = out.clen;
        int ennz2 = (int)(in.nonZeros / (long)m2);
        SparseBlock a = in.getSparseBlock();
        SparseBlock c = out.getSparseBlock();
        if (cnt != null) {
            for (i = cl; i < cu; ++i) {
                if (cnt[i] <= 0) continue;
                c.allocate(i, cnt[i]);
            }
        } else {
            for (i = cl; i < cu; ++i) {
                c.allocate(i, Math.max(ennz2, 2), n2);
            }
        }
        // inverse sparsity (cells per non-zero); a zero non-zero count would otherwise
        // raise an ArithmeticException, so fall back to the minimum block size in that case
        long nnz = in.nonZeros;
        long xsp = nnz == 0L ? 0L : (long)in.rlen * (long)in.clen / nnz;
        int blocksizeI = Math.max(128, (int)(8L * xsp));
        int blocksizeJ = Math.max(128, (int)(8L * xsp));
        if (blocksizeJ * 2 > m2 && c instanceof SparseBlockMCSR) {
            LibMatrixReorg.transposeSparseToSparseBlockTallSkinny(a, (SparseBlockMCSR)c, blocksizeI, rl, ru, cl, cu);
        } else if (c instanceof SparseBlockMCSR) {
            LibMatrixReorg.transposeSparseToSparseBlockMCSR(a, (SparseBlockMCSR)c, blocksizeI, blocksizeJ, rl, ru, cl, cu);
        } else {
            LibMatrixReorg.transposeSparseToSparseBlockGeneric(a, c, blocksizeI, blocksizeJ, rl, ru, cl, cu);
        }
    }

    /**
     * Sparse-to-sparse transpose without column blocking: for every input row in
     * {@code [rl, ru)} all entries with column index in {@code [cl, cu)} are
     * appended to the output row of that column.
     *
     * @param a          sparse input
     * @param c          MCSR output whose row array is written directly
     * @param blocksizeI unused by this variant
     * @param rl         first input row (inclusive)
     * @param ru         last input row (exclusive)
     * @param cl         first input column (inclusive)
     * @param cu         last input column (exclusive)
     */
    private static void transposeSparseToSparseBlockTallSkinny(SparseBlock a, SparseBlockMCSR c, int blocksizeI, int rl, int ru, int cl, int cu) {
        SparseRow[] sr = c.getRows();
        for (int i = rl; i < ru; ++i) {
            int j;
            // skip rows that are empty or have no entry at or after column cl
            if (a.isEmpty(i) || (j = a.posFIndexGTE(i, cl)) < 0) continue;
            int apos = a.pos(i);
            int alen = a.size(i);
            int[] aix = a.indexes(i);
            double[] avals = a.values(i);
            j += apos;
            while (j < apos + alen && aix[j] < cu) {
                // append returns the row to keep using; store it back as the MCSR variant does
                sr[aix[j]] = sr[aix[j]].append(i, avals[j]);
                ++j;
            }
        }
    }

    /**
     * Cache-blocked sparse-to-sparse transpose writing directly into the row
     * array of an MCSR output. Input rows are processed in blocks of
     * {@code blocksizeI}, input columns in blocks of {@code blocksizeJ}; a
     * per-row cursor remembers how far each input row has been consumed.
     *
     * @param a          sparse input
     * @param c          MCSR output
     * @param blocksizeI number of input rows per block
     * @param blocksizeJ number of input columns per block
     * @param rl         first input row (inclusive)
     * @param ru         last input row (exclusive)
     * @param cl         first input column (inclusive)
     * @param cu         last input column (exclusive)
     */
    private static void transposeSparseToSparseBlockMCSR(SparseBlock a, SparseBlockMCSR c, int blocksizeI, int blocksizeJ, int rl, int ru, int cl, int cu) {
        final int[] cursor = new int[Math.min(blocksizeI, ru - rl)];
        final SparseRow[] target = c.getRows();
        for (int rowBlock = rl; rowBlock < ru; rowBlock += blocksizeI) {
            Arrays.fill(cursor, 0);
            final int rowBlockEnd = Math.min(rowBlock + blocksizeI, ru);
            // when the column range does not start at 0, position every cursor at the
            // first entry with column >= cl (or past the end if there is none)
            if (cl > 0) {
                for (int r = rowBlock; r < rowBlockEnd; r++) {
                    if (!a.isEmpty(r)) {
                        final int first = a.posFIndexGTE(r, cl);
                        cursor[r - rowBlock] = first < 0 ? a.size(r) : first;
                    }
                }
            }
            for (int colBlock = cl; colBlock < cu; colBlock += blocksizeJ) {
                final int colBlockEnd = Math.min(colBlock + blocksizeJ, cu);
                for (int r = rowBlock; r < rowBlockEnd; r++) {
                    if (a.isEmpty(r)) {
                        continue;
                    }
                    final int start = a.pos(r);
                    final int len = a.size(r);
                    final int[] cols = a.indexes(r);
                    final double[] vals = a.values(r);
                    int p = cursor[r - rowBlock] + start;
                    while (p < start + len && cols[p] < colBlockEnd) {
                        target[cols[p]] = target[cols[p]].append(r, vals[p]);
                        p++;
                    }
                    cursor[r - rowBlock] = p - start;
                }
            }
        }
    }

    /**
     * Cache-blocked sparse-to-sparse transpose for an arbitrary output sparse
     * block; cells are written through {@code c.append}. Blocking and cursor
     * handling mirror the MCSR-specific variant.
     *
     * @param a          sparse input
     * @param c          sparse output
     * @param blocksizeI number of input rows per block
     * @param blocksizeJ number of input columns per block
     * @param rl         first input row (inclusive)
     * @param ru         last input row (exclusive)
     * @param cl         first input column (inclusive)
     * @param cu         last input column (exclusive)
     */
    private static void transposeSparseToSparseBlockGeneric(SparseBlock a, SparseBlock c, int blocksizeI, int blocksizeJ, int rl, int ru, int cl, int cu) {
        final int[] consumed = new int[Math.min(blocksizeI, ru - rl)];
        int rowBlock = rl;
        while (rowBlock < ru) {
            Arrays.fill(consumed, 0);
            final int rowBlockEnd = Math.min(rowBlock + blocksizeI, ru);
            if (cl > 0) {
                // skip the entries left of the requested column range
                for (int r = rowBlock; r < rowBlockEnd; r++) {
                    if (a.isEmpty(r)) {
                        continue;
                    }
                    final int first = a.posFIndexGTE(r, cl);
                    if (first >= 0) {
                        consumed[r - rowBlock] = first;
                    } else {
                        consumed[r - rowBlock] = a.size(r);
                    }
                }
            }
            int colBlock = cl;
            while (colBlock < cu) {
                final int colBlockEnd = Math.min(colBlock + blocksizeJ, cu);
                for (int r = rowBlock; r < rowBlockEnd; r++) {
                    if (a.isEmpty(r)) {
                        continue;
                    }
                    final int start = a.pos(r);
                    final int len = a.size(r);
                    final int[] cols = a.indexes(r);
                    final double[] vals = a.values(r);
                    int p = consumed[r - rowBlock] + start;
                    for (; p < start + len && cols[p] < colBlockEnd; p++) {
                        c.append(cols[p], r, vals[p]);
                    }
                    consumed[r - rowBlock] = p - start;
                }
                colBlock += blocksizeJ;
            }
            rowBlock += blocksizeI;
        }
    }

    /**
     * Entry point of the sparse-to-CSR transpose for a column range. Only the
     * full row range is supported. A one-column range uses the single-column
     * routine; wider ranges use the CSR-input routine when the input is CSR and
     * the generic multi-column routine otherwise.
     *
     * @param in  sparse input block
     * @param out output block
     * @param rl  first input row (must be {@code <= 0})
     * @param ru  last input row, exclusive (must be {@code >= in.rlen})
     * @param cl  first input column (inclusive)
     * @param cu  last input column (exclusive)
     * @param cnt per-column counters forwarded to the worker routines
     * @throws RuntimeException if a partial row range is requested
     */
    private static void transposeSparseToSparseCSR(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt) {
        final boolean coversAllRows = rl <= 0 && ru >= in.rlen;
        if (!coversAllRows) {
            throw new RuntimeException("Unsupported row-parallel transposeSparseToSparse: " + rl + ", " + ru);
        }
        if (cu - cl == 1) {
            LibMatrixReorg.transposeSparseToSparseCSRSingleCol(in, out, rl, ru, cl, cu, cnt);
            return;
        }
        final boolean csrInput = in.getSparseBlock() instanceof SparseBlockCSR;
        if (csrInput) {
            LibMatrixReorg.transposeSparseCSRToSparseCSRMultiCol(in, out, cl, cu, cnt);
        } else {
            LibMatrixReorg.transposeSparseToSparseCSRMultiCol(in, out, cl, cu, cnt);
        }
    }

    /**
     * CSR-to-CSR transpose of input columns {@code [cl, cu)} over all input rows.
     * The row block size is derived from the inverse sparsity of the input and
     * clamped to {@code [128, 512]}; each row block is handled by
     * {@code transposeSparseCSRToSparseCSRMultiColBlock}.
     *
     * @param in  input block holding a {@link SparseBlockCSR}
     * @param out output block holding a {@link SparseBlockCSR}
     * @param cl  first input column (inclusive)
     * @param cu  last input column (exclusive)
     * @param cnt per-column write positions, forwarded to the block worker
     */
    private static final void transposeSparseCSRToSparseCSRMultiCol(MatrixBlock in, MatrixBlock out, int cl, int cu, int[] cnt) {
        int rlen = in.rlen;
        SparseBlockCSR a = (SparseBlockCSR)in.getSparseBlock();
        SparseBlockCSR c = (SparseBlockCSR)out.getSparseBlock();
        // inverse sparsity; a zero non-zero count would otherwise raise an ArithmeticException
        long nnz = in.nonZeros;
        long xsp = nnz == 0L ? 0L : (long)rlen * (long)in.clen / nnz;
        int blocksizeI = Math.min(Math.max(128, (int)(8L * xsp)), 512);
        // per-row cursor, reused across row blocks
        int[] ix = new int[Math.min(blocksizeI, rlen)];
        for (int bi = 0; bi < rlen; bi += blocksizeI) {
            LibMatrixReorg.transposeSparseCSRToSparseCSRMultiColBlock(bi, blocksizeI, rlen, cl, cu, ix, a, cnt, c);
        }
    }

    /**
     * Handles one block of input rows starting at {@code bi} of the CSR-to-CSR
     * transpose: initializes the per-row cursors and then walks the column range
     * {@code [cl, cu)} in steps of {@code blocksizeI}.
     *
     * @param bi         first input row of this block
     * @param blocksizeI block size, used for both rows and columns
     * @param rlen       number of input rows
     * @param cl         first input column (inclusive)
     * @param cu         last input column (exclusive)
     * @param ix         per-row cursor array, reset here
     * @param a          CSR input
     * @param cnt        per-column write positions into the output arrays
     * @param c          CSR output
     */
    private static final void transposeSparseCSRToSparseCSRMultiColBlock(int bi, int blocksizeI, int rlen, int cl, int cu, int[] ix, SparseBlockCSR a, int[] cnt, SparseBlockCSR c) {
        final int[] srcCols = a.indexes();
        final double[] srcVals = a.values();
        final int[] dstIdx = c.indexes();
        final double[] dstVals = c.values();
        final int rowEnd = Math.min(bi + blocksizeI, rlen);
        if (cl <= 0) {
            // column range starts at 0: every row is consumed from its first entry
            Arrays.fill(ix, 0);
        } else {
            LibMatrixReorg.fillSkip(bi, rowEnd, a, cl, ix);
        }
        int colStart = cl;
        while (colStart < cu) {
            LibMatrixReorg.transposeSparseCSRToSparseCSRMultiColBlockBlock(bi, colStart, rowEnd, cu, blocksizeI, a, ix, srcCols, srcVals, dstIdx, dstVals, cnt);
            colStart += blocksizeI;
        }
    }

    /**
     * Copies the cells of one row-block by column-block tile into the CSR output
     * arrays. For every entry with column {@code col} below the tile's upper
     * bound, the input row index and value are written at position
     * {@code cnt[col]}, which is then advanced by one.
     *
     * @param bi         first input row of the tile (inclusive)
     * @param bj         first input column of the tile
     * @param bimin      last input row of the tile (exclusive)
     * @param cu         upper bound of the overall column range (exclusive)
     * @param blocksizeI tile width in columns
     * @param a          CSR input, queried for row position and size
     * @param ix         per-row cursor, updated to the first unconsumed entry
     * @param aix        column indexes of the input
     * @param avals      values of the input
     * @param outIndexes output index array
     * @param outValues  output value array
     * @param cnt        per-column write positions, advanced in place
     */
    private static final void transposeSparseCSRToSparseCSRMultiColBlockBlock(int bi, int bj, int bimin, int cu, int blocksizeI, SparseBlockCSR a, int[] ix, int[] aix, double[] avals, int[] outIndexes, double[] outValues, int[] cnt) {
        final int colEnd = Math.min(bj + blocksizeI, cu);
        for (int r = bi; r < bimin; r++) {
            final int slot = r - bi;
            final int start = a.pos(r);
            final int len = a.size(r);
            int p = ix[slot] + start;
            while (p < start + len && aix[p] < colEnd) {
                final int col = aix[p];
                final int dest = cnt[col]++;
                outIndexes[dest] = r;
                outValues[dest] = avals[p];
                p++;
            }
            ix[slot] = p - start;
        }
    }

    /**
     * Initializes the per-row cursors for rows {@code [bi, bimin)} so that each
     * points at the first entry with column index {@code >= cl}; rows without
     * such an entry get their full size, i.e. are treated as fully consumed.
     *
     * @param bi    first row (inclusive)
     * @param bimin last row (exclusive)
     * @param a     CSR input
     * @param cl    lower column bound
     * @param ix    cursor array, written at {@code row - bi}
     */
    private static final void fillSkip(int bi, int bimin, SparseBlockCSR a, int cl, int[] ix) {
        for (int r = bi, slot = 0; r < bimin; r++, slot++) {
            final int first = a.posFIndexGTE(r, cl);
            if (first < 0) {
                ix[slot] = a.size(r);
            } else {
                ix[slot] = first;
            }
        }
    }

    /**
     * Transposes input columns {@code [cl, cu)} of an arbitrary sparse input
     * into a CSR output. Rows and columns are processed in square blocks whose
     * size is derived from the inverse sparsity of the input and clamped to
     * {@code [128, 512]}. Each cell is written at position {@code cnt[column]},
     * which is then advanced by one.
     *
     * @param in  sparse input block
     * @param out output block holding a {@link SparseBlockCSR}
     * @param cl  first input column (inclusive)
     * @param cu  last input column (exclusive)
     * @param cnt per-column write positions into the output arrays, advanced in place
     */
    private static final void transposeSparseToSparseCSRMultiCol(MatrixBlock in, MatrixBlock out, int cl, int cu, int[] cnt) {
        int rlen = in.rlen;
        SparseBlock a = in.getSparseBlock();
        SparseBlockCSR c = (SparseBlockCSR)out.getSparseBlock();
        int[] outIndexes = c.indexes();
        double[] outValues = c.values();
        // inverse sparsity; a zero non-zero count would otherwise raise an ArithmeticException
        long nnz = in.nonZeros;
        long xsp = nnz == 0L ? 0L : (long)rlen * (long)in.clen / nnz;
        int blocksizeI = Math.min(Math.max(128, (int)(8L * xsp)), 512);
        // per-row cursor: offset of the first not yet consumed entry of each row in the block
        int[] ix = new int[Math.min(blocksizeI, rlen)];
        for (int bi = 0; bi < rlen; bi += blocksizeI) {
            Arrays.fill(ix, 0);
            int bimin = Math.min(bi + blocksizeI, rlen);
            if (cl > 0) {
                // skip entries left of the requested column range
                for (int i = bi; i < bimin; ++i) {
                    if (a.isEmpty(i)) continue;
                    int j = a.posFIndexGTE(i, cl);
                    ix[i - bi] = j >= 0 ? j : a.size(i);
                }
            }
            for (int bj = cl; bj < cu; bj += blocksizeI) {
                int bjmin = Math.min(bj + blocksizeI, cu);
                for (int i = bi; i < bimin; ++i) {
                    int j;
                    if (a.isEmpty(i)) continue;
                    int apos = a.pos(i);
                    int alen = a.size(i);
                    int[] aix = a.indexes(i);
                    double[] avals = a.values(i);
                    for (j = ix[i - bi] + apos; j < apos + alen && aix[j] < bjmin; ++j) {
                        int pointer = cnt[aix[j]];
                        int n = aix[j];
                        cnt[n] = cnt[n] + 1;
                        outIndexes[pointer] = i;
                        outValues[pointer] = avals[j];
                    }
                    ix[i - bi] = j - apos;
                }
            }
        }
    }

    /**
     * Transposes the single input column {@code cl} into the CSR output. Input
     * rows are scanned from row 0 until the write position, starting at
     * {@code cnt[cl]}, reaches the end of output row {@code cl}.
     *
     * @param in  sparse input block
     * @param out output block holding a {@link SparseBlockCSR}
     * @param rl  unused
     * @param ru  unused
     * @param cl  input column to transpose
     * @param cu  unused
     * @param cnt per-column write positions; only {@code cnt[cl]} is read
     */
    private static void transposeSparseToSparseCSRSingleCol(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt) {
        final SparseBlock source = in.getSparseBlock();
        final SparseBlockCSR target = (SparseBlockCSR)out.getSparseBlock();
        final int[] dstIdx = target.indexes();
        final double[] dstVals = target.values();
        final int stop = target.size(cl) + target.pos(cl);
        int write = cnt[cl];
        for (int r = 0; write < stop; r++) {
            if (source.isEmpty(r)) {
                continue;
            }
            final int start = source.pos(r);
            final int len = source.size(r);
            final int[] cols = source.indexes(r);
            final double[] vals = source.values(r);
            // stop scanning the row once the column indexes exceed cl
            for (int p = start; p < start + len && cols[p] <= cl; p++) {
                if (cols[p] == cl) {
                    dstIdx[write] = r;
                    dstVals[write] = vals[p];
                    write++;
                }
            }
        }
    }

    /**
     * Transposes a sparse input into a dense output. A single-row input is
     * scattered directly into the first dense value array of the output; any
     * other input is processed in 128x128 tiles over rows {@code [rl, ru)} and
     * all columns, using a per-row cursor to resume each row between column tiles.
     *
     * @param in  sparse input block
     * @param out output block with an allocated dense block
     * @param rl  first input row (inclusive); ignored for single-row inputs
     * @param ru  last input row (exclusive); ignored for single-row inputs
     * @param cl  unused
     * @param cu  unused
     */
    private static void transposeSparseToDense(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu) {
        final int blocksize = 128;
        int m = in.rlen;
        int n = in.clen;
        SparseBlock a = in.getSparseBlock();
        DenseBlock c = out.getDenseBlock();
        if (m == 1) {
            // same emptiness guard and row offset handling as the tiled branch below
            if (a.isEmpty(0)) {
                return;
            }
            int apos = a.pos(0);
            int alen = a.size(0);
            int[] aix = a.indexes(0);
            double[] avals = a.values(0);
            double[] cvals = c.valuesAt(0);
            for (int j = 0; j < alen; ++j) {
                cvals[aix[apos + j]] = avals[apos + j];
            }
        } else {
            // per-row cursor: number of entries of each row in the block already written
            int[] ix = new int[blocksize];
            for (int bi = rl; bi < ru; bi += blocksize) {
                Arrays.fill(ix, 0);
                int bimin = Math.min(bi + blocksize, ru);
                for (int bj = 0; bj < n; bj += blocksize) {
                    int bjmin = Math.min(bj + blocksize, n);
                    int i = bi;
                    int iix = 0;
                    while (i < bimin) {
                        if (!a.isEmpty(i)) {
                            int j;
                            int apos = a.pos(i);
                            int alen = a.size(i);
                            int[] aix = a.indexes(i);
                            double[] avals = a.values(i);
                            for (j = ix[iix]; j < alen && aix[apos + j] < bjmin; ++j) {
                                c.set(aix[apos + j], i, avals[apos + j]);
                            }
                            ix[iix] = j;
                        }
                        ++i;
                        ++iix;
                    }
                }
            }
        }
    }

    /**
     * Transposes a dense, single-block matrix in place. Vectors only have their
     * dimensions swapped, square matrices use the trivial swap-based routine,
     * and rectangular matrices use {@code c2r} (before the dimension swap) or
     * {@code r2c} (after the dimension swap) depending on which side is longer.
     *
     * @param in block to transpose in place
     * @param k  parallelism degree forwarded to the worker routines
     * @throws NotImplementedException if the dense block consists of more than one block
     */
    private static void transposeInPlaceDense(MatrixBlock in, int k) {
        final DenseBlock dense = in.getDenseBlock();
        if (dense.numBlocks() > 1) {
            throw new NotImplementedException("Not Implemented in place transpose with more than one block");
        }
        // names refer to the dimensions after the transpose
        final int cols = in.getNumRows();
        final int rows = in.getNumColumns();
        final boolean vector = cols == 1 || rows == 1;
        if (!vector && cols == rows) {
            LibMatrixReorg.transposeInPlaceTrivial(in.getDenseBlockValues(), cols, k);
            return;
        }
        final boolean shuffleBeforeResize = !vector && cols < rows;
        if (shuffleBeforeResize) {
            LibMatrixReorg.c2r(in, k);
        }
        dense.setDims(new int[]{rows, cols});
        in.setNumColumns(cols);
        in.setNumRows(rows);
        if (!vector && !shuffleBeforeResize) {
            LibMatrixReorg.r2c(in, k);
        }
    }

    /**
     * In-place transpose of a square matrix stored row-major in {@code values}.
     * Matrices with more than 15 rows are split into 128x128 tiles on and above
     * the diagonal, one task per tile; smaller matrices swap each pair of
     * mirrored cells on the calling thread.
     *
     * @param values     row-major cell values, modified in place
     * @param rowAndCols number of rows (equal to the number of columns)
     * @param k          parallelism degree for the tiled path
     * @throws DMLRuntimeException if a parallel task fails or is interrupted
     */
    private static void transposeInPlaceTrivial(double[] values, int rowAndCols, int k) {
        if (rowAndCols <= 15) {
            for (int rowidx = 0; rowidx < rowAndCols; ++rowidx) {
                for (int colidx = rowidx + 1; colidx < rowAndCols; ++colidx) {
                    LibMatrixReorg.swap(values, rowidx * rowAndCols + colidx, colidx * rowAndCols + rowidx);
                }
            }
            return;
        }
        ExecutorService pool = CommonThreadPool.get(k);
        try {
            ArrayList<TransposeInPlaceTrivialTask> tasks = new ArrayList<TransposeInPlaceTrivialTask>();
            int blklen = 128;
            for (int i = 0; i * blklen < rowAndCols; ++i) {
                // tiles below the diagonal are covered by their mirrored counterpart
                for (int j = i; j * blklen < rowAndCols; ++j) {
                    tasks.add(new TransposeInPlaceTrivialTask(i * blklen, Math.min((i + 1) * blklen, rowAndCols), j * blklen, Math.min((j + 1) * blklen, rowAndCols), rowAndCols, values));
                }
            }
            for (Future<?> rt : pool.invokeAll(tasks)) {
                // surfaces any exception thrown inside the task
                rt.get();
            }
        }
        catch (InterruptedException | ExecutionException ex) {
            throw new DMLRuntimeException("Failed parallel transpose in place with equal number col and rows.", ex);
        }
        finally {
            // release the pool on failure as well as on success
            pool.shutdown();
        }
    }

    /**
     * Exchanges two cells of an array.
     *
     * @param values array to modify
     * @param from   index of the first cell
     * @param to     index of the second cell
     */
    private static void swap(double[] values, int from, int to) {
        final double held = values[to];
        values[to] = values[from];
        values[from] = held;
    }

    /**
     * In-place transpose helper working on the dense values of {@code in} in
     * three passes: a column rotate (only when {@code gcd(m, n) > 1}), a row
     * shuffle and a column shuffle. Matrices with more than 10 rows and more
     * than 100 columns run each pass as parallel tasks; smaller ones run on the
     * calling thread with a scratch buffer taken from {@code memPool}.
     *
     * @param in block whose dense values are permuted in place
     * @param k  parallelism degree for the parallel passes
     * @throws DMLRuntimeException if a parallel task fails or is interrupted
     */
    private static void c2r(MatrixBlock in, int k) {
        double[] A = in.getDenseBlockValues();
        int m = in.getNumRows();
        int n = in.getNumColumns();
        int c = LibMatrixReorg.gcd(m, n);
        int a = m / c;
        int b = n / c;
        double[] tmp = memPool.get();
        if (tmp == null) {
            memPool.set(new double[Math.max(m, n)]);
            tmp = memPool.get();
        }
        final boolean parallel = m > 10 && n > 100;
        // only the parallel passes need a pool
        final ExecutorService pool = parallel ? CommonThreadPool.get(k) : null;
        try {
            ArrayList<Callable<Object>> tasks = new ArrayList<Callable<Object>>();
            // pass 1: column rotate, only required when m and n share a common factor
            if (c > 1) {
                if (parallel) {
                    try {
                        int blkz = Math.max((n - c) / k, 1);
                        // NOTE(review): task ranges start at column c * blkz while the sequential
                        // path starts at column c; confirm against rTask that no columns are
                        // skipped when blkz > 1.
                        for (int j = c; j * blkz < n; ++j) {
                            tasks.add(new rTask(A, j * blkz, Math.min((j + 1) * blkz, n), b, n, m));
                        }
                        for (Future<Object> rt : pool.invokeAll(tasks)) {
                            rt.get();
                        }
                        tasks.clear();
                    }
                    catch (InterruptedException | ExecutionException ex) {
                        throw new DMLRuntimeException("Failed parallel c2r transpose in column rotate step", ex);
                    }
                } else {
                    for (int j = c; j < n; ++j) {
                        LibMatrixReorg.rj(tmp, A, j, b, n, m);
                    }
                }
            }
            // pass 2: row shuffle
            if (parallel) {
                try {
                    int blkz = Math.max(m / k, 1);
                    for (int i = 0; i * blkz < m; ++i) {
                        tasks.add(new dTask(A, i * blkz, Math.min((i + 1) * blkz, m), b, n, m));
                    }
                    for (Future<Object> rt : pool.invokeAll(tasks)) {
                        rt.get();
                    }
                    tasks.clear();
                }
                catch (InterruptedException | ExecutionException ex) {
                    throw new DMLRuntimeException("Failed parallel c2r transpose in row shuffle step", ex);
                }
            } else {
                for (int i = 0; i < m; ++i) {
                    LibMatrixReorg.di(tmp, A, i, b, n, m);
                }
            }
            // pass 3: column shuffle
            if (parallel) {
                try {
                    int blkz = Math.max(n / k, 1);
                    for (int j = 0; j * blkz < n; ++j) {
                        tasks.add(new sTask(A, j * blkz, Math.min((j + 1) * blkz, n), a, n, m));
                    }
                    for (Future<Object> rt : pool.invokeAll(tasks)) {
                        rt.get();
                    }
                    tasks.clear();
                }
                catch (InterruptedException | ExecutionException ex) {
                    throw new DMLRuntimeException("Failed parallel c2r transpose in column shuffle", ex);
                }
            } else {
                for (int j = 0; j < n; ++j) {
                    LibMatrixReorg.sj(tmp, A, j, a, n, m);
                }
            }
        }
        finally {
            // release the pool and the scratch buffer on every exit path
            if (pool != null) {
                pool.shutdown();
            }
            memPool.remove();
        }
    }

    /**
     * Rotates column {@code j} of the row-major {@code m x n} matrix {@code A}
     * upwards by {@code j / b} positions (cyclically), using {@code tmp} as
     * scratch space.
     *
     * @param tmp scratch buffer with at least {@code m} cells
     * @param A   row-major matrix values, modified in place
     * @param j   column to rotate
     * @param b   divisor that determines the rotation amount
     * @param n   number of columns
     * @param m   number of rows
     */
    private static void rj(double[] tmp, double[] A, int j, int b, int n, int m) {
        final int shift = j / b;
        // gather the rotated column into the scratch buffer
        int row = 0;
        while (row < m) {
            final int sourceRow = (row + shift) % m;
            tmp[row] = A[sourceRow * n + j];
            row++;
        }
        // write the scratch buffer back into column j
        final int cells = m * n;
        for (int pos = j, k = 0; pos < cells; pos += n, k++) {
            A[pos] = tmp[k];
        }
    }

    /**
     * Permutes row {@code i} of the row-major {@code m x n} matrix {@code A}:
     * the cell in column {@code j} moves to column
     * {@code ((i + j / b) % m + j * m) % n}. The permuted row is built in
     * {@code tmp} and then copied back.
     *
     * @param tmp scratch buffer with at least {@code n} cells
     * @param A   row-major matrix values, modified in place
     * @param i   row to permute
     * @param b   divisor used in the target-column formula
     * @param n   number of columns
     * @param m   number of rows
     */
    private static void di(double[] tmp, double[] A, int i, int b, int n, int m) {
        final int rowStart = i * n;
        for (int col = 0; col < n; col++) {
            final int target = ((i + col / b) % m + col * m) % n;
            tmp[target] = A[rowStart + col];
        }
        System.arraycopy(tmp, 0, A, rowStart, n);
    }

    /**
     * Shuffles column {@code j} of the row-major {@code m x n} matrix {@code A}:
     * row {@code i} of the column receives the value from row
     * {@code (j + i * n - i / a) % m}. The shuffled column is built in
     * {@code tmp} and then written back.
     *
     * @param tmp scratch buffer with at least {@code m} cells
     * @param A   row-major matrix values, modified in place
     * @param j   column to shuffle
     * @param a   divisor used in the source-row formula
     * @param n   number of columns
     * @param m   number of rows
     */
    private static void sj(double[] tmp, double[] A, int j, int a, int n, int m) {
        int row = 0;
        while (row < m) {
            final int sourceOffset = (j + row * n - row / a) % m * n;
            tmp[row] = A[sourceOffset + j];
            row++;
        }
        final int cells = m * n;
        for (int pos = j, k = 0; pos < cells; pos += n, k++) {
            A[pos] = tmp[k];
        }
    }

    /**
     * In-place transpose helper working on the dense values of {@code in} in
     * three passes: an inverse column shuffle, an inverse row shuffle and an
     * inverse column rotate (only when {@code gcd(m, n) > 1}). Matrices with
     * more than 10 rows and more than 100 columns run each pass as parallel
     * tasks; smaller ones run on the calling thread with a scratch buffer taken
     * from {@code memPool}.
     *
     * @param in block whose dense values are permuted in place
     * @param k  parallelism degree for the parallel passes
     * @throws DMLRuntimeException if a parallel task fails or is interrupted
     */
    private static void r2c(MatrixBlock in, int k) {
        double[] A = in.getDenseBlockValues();
        int m = in.getNumRows();
        int n = in.getNumColumns();
        int c = LibMatrixReorg.gcd(m, n);
        int a = m / c;
        int b = n / c;
        int a_inv = LibMatrixReorg.modInverse(a, b);
        double[] tmp = memPool.get();
        if (tmp == null) {
            memPool.set(new double[Math.max(m, n)]);
            tmp = memPool.get();
        }
        final boolean parallel = m > 10 && n > 100;
        // only the parallel passes need a pool
        final ExecutorService pool = parallel ? CommonThreadPool.get(k) : null;
        try {
            ArrayList<Callable<Object>> tasks = new ArrayList<Callable<Object>>();
            // pass 1: inverse column shuffle
            if (parallel) {
                try {
                    int blkz = Math.max(n / k, 1);
                    for (int j = 0; j * blkz < n; ++j) {
                        tasks.add(new s_invTask(A, j * blkz, Math.min((j + 1) * blkz, n), a, n, m));
                    }
                    for (Future<Object> rt : pool.invokeAll(tasks)) {
                        rt.get();
                    }
                    tasks.clear();
                }
                catch (InterruptedException | ExecutionException ex) {
                    throw new DMLRuntimeException("Failed parallel r2c transpose in place in inverse colum shuffle.", ex);
                }
            } else {
                for (int j = 0; j < n; ++j) {
                    LibMatrixReorg.sj_inv(tmp, A, j, a, n, m);
                }
            }
            // pass 2: inverse row shuffle
            if (parallel) {
                try {
                    int blkz = Math.max(m / k, 1);
                    for (int i = 0; i * blkz < m; ++i) {
                        tasks.add(new d_invTask(A, i * blkz, Math.min((i + 1) * blkz, m), a_inv, b, c, n, m));
                    }
                    for (Future<Object> rt : pool.invokeAll(tasks)) {
                        rt.get();
                    }
                    tasks.clear();
                }
                catch (InterruptedException | ExecutionException ex) {
                    throw new DMLRuntimeException("Failed parallel r2c transpose in placein inverse row shuffle step.", ex);
                }
            } else if ((long)b * (long)b > (long)Integer.MAX_VALUE) {
                // di_inv multiplies two values below b in int arithmetic; use the long-based
                // variant whenever b * b does not fit into an int (an int product can wrap to
                // a non-negative value, so a sign test does not detect every overflow)
                for (int i = 0; i < m; ++i) {
                    LibMatrixReorg.di_inv_safe(tmp, A, i, a_inv, b, c, n, m);
                }
            } else {
                for (int i = 0; i < m; ++i) {
                    LibMatrixReorg.di_inv(tmp, A, i, a_inv, b, c, n, m);
                }
            }
            // pass 3: inverse column rotate, only required when m and n share a common factor
            if (c > 1) {
                if (parallel) {
                    try {
                        int blkz = Math.max((n - c) / k, 1);
                        // NOTE(review): task ranges start at column c * blkz while the sequential
                        // path starts at column c; confirm against r_invTask that no columns are
                        // skipped when blkz > 1.
                        for (int j = c; j * blkz < n; ++j) {
                            tasks.add(new r_invTask(A, j * blkz, Math.min((j + 1) * blkz, n), b, n, m));
                        }
                        for (Future<Object> rt : pool.invokeAll(tasks)) {
                            rt.get();
                        }
                        tasks.clear();
                    }
                    catch (InterruptedException | ExecutionException ex) {
                        throw new DMLRuntimeException("Failed parallel r2c transpose in place inverse column rotate step.", ex);
                    }
                } else {
                    for (int j = c; j < n; ++j) {
                        LibMatrixReorg.rj_inv(tmp, A, j, b, n, m);
                    }
                }
            }
        }
        finally {
            // release the pool and the scratch buffer on every exit path
            if (pool != null) {
                pool.shutdown();
            }
            memPool.remove();
        }
    }

    /**
     * Inverse column shuffle of column {@code j} of the row-major
     * {@code m x n} matrix {@code A}: the value in row {@code r} moves to row
     * {@code (j + r * n - r / a) % m}. The shuffled column is built in
     * {@code tmp} and then written back.
     *
     * @param tmp scratch buffer with at least {@code m} cells
     * @param A   row-major matrix values, modified in place
     * @param j   column to shuffle
     * @param a   divisor used in the target-row formula
     * @param n   number of columns
     * @param m   number of rows
     */
    private static void sj_inv(double[] tmp, double[] A, int j, int a, int n, int m) {
        final int cells = m * n;
        // scatter the column into the scratch buffer
        for (int rowOffset = 0, row = 0; rowOffset < cells; rowOffset += n, row++) {
            final int targetRow = (j + rowOffset - row / a) % m;
            tmp[targetRow] = A[rowOffset + j];
        }
        // write the scratch buffer back into column j
        for (int pos = j, row = 0; pos < cells; pos += n, row++) {
            A[pos] = tmp[row];
        }
    }

    /**
     * Inverse row shuffle of row {@code i} of the row-major {@code m x n}
     * matrix {@code A}, computed in {@code int} arithmetic. Each cell of the
     * row is scattered into {@code tmp} at a target column derived from
     * {@code a_inv}, {@code b} and {@code c}; the row is then copied back.
     *
     * @param tmp   scratch buffer with at least {@code n} cells
     * @param A     row-major matrix values, modified in place
     * @param i     row to permute
     * @param a_inv multiplier used in the target-column formula
     * @param b     modulus used in the target-column formula
     * @param c     divisor/modulus used in the target-column formula
     * @param n     number of columns
     * @param m     number of rows
     */
    private static void di_inv(double[] tmp, double[] A, int i, int a_inv, int b, int c, int n, int m) {
        final int rowStart = i * n;
        final int rowPlusC = i + c;
        final int base = i * (n - 1);
        for (int col = 0; col < n; col++) {
            int f = col + base;
            if (rowPlusC - col % c > m) {
                f += m;
            }
            final int target = a_inv % b * (f / c % b) % b + f % c * b;
            tmp[target] = A[rowStart + col];
        }
        System.arraycopy(tmp, 0, A, rowStart, n);
    }

    /**
     * Variant of the inverse row shuffle that evaluates the modular product of
     * the target-column formula in {@code long} arithmetic before narrowing the
     * result back to {@code int}.
     *
     * @param tmp   scratch buffer with at least {@code n} cells
     * @param A     row-major matrix values, modified in place
     * @param i     row to permute
     * @param a_inv multiplier used in the target-column formula
     * @param b     modulus used in the target-column formula
     * @param c     divisor/modulus used in the target-column formula
     * @param n     number of columns
     * @param m     number of rows
     */
    private static void di_inv_safe(double[] tmp, double[] A, int i, int a_inv, int b, int c, int n, int m) {
        final int rowStart = i * n;
        final int rowPlusC = i + c;
        final int base = i * (n - 1);
        for (int col = 0; col < n; col++) {
            int f = col + base;
            if (rowPlusC - col % c > m) {
                f += m;
            }
            final long product = (long)(a_inv % b) * (long)(f / c % b);
            final int target = (int)(product % (long)b) + f % c * b;
            tmp[target] = A[rowStart + col];
        }
        System.arraycopy(tmp, 0, A, rowStart, n);
    }

    /**
     * Rotates column {@code j} of the row-major {@code m x n} matrix {@code A}
     * downwards by {@code j / b} positions (cyclically), using {@code tmp} as
     * scratch space.
     *
     * @param tmp scratch buffer with at least {@code m} cells
     * @param A   row-major matrix values, modified in place
     * @param j   column to rotate
     * @param b   divisor that determines the rotation amount
     * @param n   number of columns
     * @param m   number of rows
     */
    private static void rj_inv(double[] tmp, double[] A, int j, int b, int n, int m) {
        final int shift = j / b;
        int row = 0;
        while (row < m) {
            final int remainder = (row - shift) % m;
            // Java's % keeps the sign of the dividend; wrap negative results into [0, m)
            final int sourceRow = remainder < 0 ? remainder + m : remainder;
            tmp[row] = A[sourceRow * n + j];
            row++;
        }
        final int cells = m * n;
        for (int pos = j, k = 0; pos < cells; pos += n, k++) {
            A[pos] = tmp[k];
        }
    }

    /**
     * Finds the modular multiplicative inverse of {@code a} modulo {@code m} by
     * trying every candidate in {@code [1, m)}. The product is evaluated in
     * {@code long} arithmetic so that large moduli do not overflow.
     *
     * @param a value to invert
     * @param m modulus
     * @return the smallest {@code x} in {@code [1, m)} with
     *         {@code (a * x) % m == 1}, or {@code 1} if no such value exists
     */
    private static int modInverse(int a, int m) {
        final long reduced = a % m;
        for (int x = 1; x < m; ++x) {
            if (reduced * x % m == 1L) {
                return x;
            }
        }
        return 1;
    }

    /**
     * Computes the greatest common divisor with the Euclidean algorithm.
     *
     * @param a first value
     * @param b second value
     * @return {@code b} once {@code a} has been reduced to zero
     */
    private static int gcd(int a, int b) {
        int small = a;
        int large = b;
        while (small != 0) {
            final int remainder = large % small;
            large = small;
            small = remainder;
        }
        return large;
    }

    /**
     * Copies {@code len} consecutive values of {@code a}, starting at
     * {@code aix}, into {@code c} with a stride of {@code n2}, starting at
     * {@code cix}. The first {@code len % 8} values are copied one at a time,
     * the remainder in unrolled groups of eight.
     *
     * @param a   source array
     * @param c   target array
     * @param aix start offset in {@code a}
     * @param cix start offset in {@code c}
     * @param n2  stride between consecutive target cells
     * @param len number of values to copy
     */
    static void transposeRow(double[] a, double[] c, int aix, int cix, int n2, int len) {
        final int remainder = len % 8;
        int src = aix;
        int dst = cix;
        // leading values that do not fill a group of eight
        for (int done = 0; done < remainder; done++) {
            c[dst] = a[src];
            src++;
            dst += n2;
        }
        // unrolled groups of eight
        for (int done = remainder; done < len; done += 8) {
            c[dst] = a[src];
            c[dst + n2] = a[src + 1];
            c[dst + 2 * n2] = a[src + 2];
            c[dst + 3 * n2] = a[src + 3];
            c[dst + 4 * n2] = a[src + 4];
            c[dst + 5 * n2] = a[src + 5];
            c[dst + 6 * n2] = a[src + 6];
            c[dst + 7 * n2] = a[src + 7];
            src += 8;
            dst += 8 * n2;
        }
    }

    /**
     * Counts the non-zeros per column over all rows of the input, subject to
     * the column limit applied by the ranged overload.
     *
     * @param in     input block
     * @param maxCol column limit forwarded to the ranged overload
     * @return the result of the ranged overload for rows {@code [0, numRows)}
     */
    private static int[] countNnzPerColumn(MatrixBlock in, int maxCol) {
        final int rowCount = in.getNumRows();
        return LibMatrixReorg.countNnzPerColumn(in, 0, rowCount, maxCol);
    }

    /**
     * Counts the non-zeros per column for rows {@code [rl, ru)} of a sparse
     * input, but only if the input has at most {@code maxCol} columns.
     *
     * @param in     input block; its sparse block is read directly
     * @param rl     first row (inclusive)
     * @param ru     last row (exclusive)
     * @param maxCol maximum number of columns for which counts are computed
     * @return per-column counts, or {@code null} if the input has more than
     *         {@code maxCol} columns
     */
    private static int[] countNnzPerColumn(MatrixBlock in, int rl, int ru, int maxCol) {
        if (in.clen > maxCol) {
            return null;
        }
        final SparseBlock rows = in.sparseBlock;
        final int[] counts = new int[in.clen];
        for (int r = rl; r < ru; r++) {
            if (!rows.isEmpty(r)) {
                LibMatrixReorg.countAgg(counts, rows.indexes(r), rows.pos(r), rows.size(r));
            }
        }
        return counts;
    }

    /**
     * Counts the non-zeros per column over all rows of the input.
     *
     * @param in input block
     * @return per-column non-zero counts
     */
    public static int[] countNnzPerColumn(MatrixBlock in) {
        final int rowCount = in.getNumRows();
        return LibMatrixReorg.countNnzPerColumn(in, 0, rowCount);
    }

    /**
     * Counts the non-zeros per column for rows {@code [rl, ru)}, choosing the
     * sparse or dense implementation from the input's current format.
     *
     * @param in input block
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     * @return per-column non-zero counts
     */
    public static int[] countNnzPerColumn(MatrixBlock in, int rl, int ru) {
        return in.isInSparseFormat()
            ? LibMatrixReorg.countNnzPerColumnSparse(in, rl, ru)
            : LibMatrixReorg.countNnzPerColumnDense(in, rl, ru);
    }

    /**
     * Counts the non-zeros per column for rows {@code [rl, ru)} of a sparse
     * input by aggregating the column indexes of every non-empty row.
     *
     * @param in input block; its sparse block is read directly
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     * @return per-column counts of length {@code in.clen}
     */
    private static int[] countNnzPerColumnSparse(MatrixBlock in, int rl, int ru) {
        final int[] counts = new int[in.clen];
        final SparseBlock rows = in.sparseBlock;
        int r = rl;
        while (r < ru) {
            if (!rows.isEmpty(r)) {
                LibMatrixReorg.countAgg(counts, rows.indexes(r), rows.pos(r), rows.size(r));
            }
            r++;
        }
        return counts;
    }

    /**
     * Counts the non-zeros per column for rows {@code [rl, ru)} of a dense
     * input by scanning the dense value array row by row.
     *
     * @param in input block whose dense values are read as one row-major array
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     * @return per-column counts of length {@code in.clen}
     */
    private static int[] countNnzPerColumnDense(MatrixBlock in, int rl, int ru) {
        final int[] counts = new int[in.clen];
        final double[] data = in.getDenseBlockValues();
        int pos = rl * in.clen;
        for (int r = rl; r < ru; r++) {
            for (int col = 0; col < in.clen; col++, pos++) {
                if (data[pos] != 0.0) {
                    counts[col]++;
                }
            }
        }
        return counts;
    }

    /**
     * Adds the counts of {@code cnt2} element-wise into {@code cnt}. If either
     * argument is {@code null}, the other one is returned unchanged.
     *
     * @param cnt  accumulator, modified in place; may be {@code null}
     * @param cnt2 counts to add; may be {@code null}
     * @return {@code cnt} after the merge, or the non-null argument if one of
     *         them is {@code null} (and {@code null} if both are)
     */
    public static int[] mergeNnzCounts(int[] cnt, int[] cnt2) {
        if (cnt == null) {
            return cnt2;
        }
        // symmetric to the guard above: nothing to add
        if (cnt2 == null) {
            return cnt;
        }
        for (int i = 0; i < cnt.length; ++i) {
            cnt[i] += cnt2[i];
        }
        return cnt;
    }

    /**
     * Writes the rows of a dense input into the output in reverse order. The
     * output is marked dense, inherits the non-zero count and gets its dense
     * block allocated here. Single-column inputs are reversed cell by cell on
     * the raw value arrays; wider inputs are copied row by row.
     *
     * @param in  dense input block
     * @param out output block
     */
    private static void reverseDense(MatrixBlock in, MatrixBlock out) {
        final int rows = in.rlen;
        final int cols = in.clen;
        out.sparse = false;
        out.nonZeros = in.nonZeros;
        out.allocateDenseBlock(false);
        if (cols != 1) {
            final DenseBlock source = in.getDenseBlock();
            final DenseBlock target = out.getDenseBlock();
            for (int r = 0; r < rows; r++) {
                final int mirrored = rows - 1 - r;
                System.arraycopy(source.values(r), source.pos(r), target.values(mirrored), target.pos(mirrored), cols);
            }
            return;
        }
        final double[] source = in.getDenseBlockValues();
        final double[] target = out.getDenseBlockValues();
        for (int r = 0; r < rows; r++) {
            target[rows - 1 - r] = source[r];
        }
    }

    /**
     * Writes the non-empty rows of a sparse input into the output in reverse
     * row order. The output is marked sparse, inherits the non-zero count and
     * gets its sparse rows block allocated here.
     *
     * @param in  sparse input block
     * @param out output block
     */
    private static void reverseSparse(MatrixBlock in, MatrixBlock out) {
        final int rows = in.rlen;
        out.sparse = true;
        out.nonZeros = in.nonZeros;
        out.allocateSparseRowsBlock(false);
        final SparseBlock source = in.getSparseBlock();
        final SparseBlock target = out.getSparseBlock();
        for (int r = 0, mirrored = rows - 1; r < rows; r++, mirrored--) {
            if (!source.isEmpty(r)) {
                target.set(mirrored, source.get(r), true);
            }
        }
    }

    /**
     * Places the values of a column vector on the diagonal of the output
     * matrix. For a sparse output a CSR block is assembled directly; for a
     * dense output every non-zero value is appended at {@code (i, i)}. The
     * output non-zero count is copied from the input.
     *
     * @param in  input column vector
     * @param out output matrix; {@code out.sparse} selects the representation
     */
    private static void diagV2M(MatrixBlock in, MatrixBlock out) {
        int rlen = in.rlen;
        if (out.sparse) {
            int[] rptr;
            int[] cix;
            double[] vals;
            if ((long)rlen == in.nonZeros && !in.sparse) {
                // dense vector without zeros: every row holds exactly one value on the diagonal.
                // The same sequence array is used as row pointers and as column indexes;
                // presumably getSeqArray(0, rlen, 1) yields 0..rlen - verify against UtilFunctions.
                cix = UtilFunctions.getSeqArray(0, rlen, 1);
                rptr = cix;
                vals = in.getDenseBlockValues();
            } else {
                rptr = new int[rlen + 1];
                cix = new int[(int)in.nonZeros];
                vals = new double[(int)in.nonZeros];
                int pos = 0;
                for (int i = 0; i < rlen; ++i) {
                    double val = in.quickGetValue(i, 0);
                    if (val != 0.0) {
                        cix[pos] = i;
                        vals[pos] = val;
                        // advance only when a value was stored, so that zero cells leave the
                        // row empty and pos never exceeds the number of non-zeros
                        ++pos;
                    }
                    rptr[i + 1] = pos;
                }
            }
            out.sparseBlock = new SparseBlockCSR(rptr, cix, vals, (int)in.nonZeros);
        } else {
            for (int i = 0; i < rlen; ++i) {
                double val = in.quickGetValue(i, 0);
                if (val == 0.0) continue;
                out.appendValue(i, i, val);
            }
        }
        out.setNonZeros(in.nonZeros);
    }

    /**
     * Extracts the main diagonal of the input into column 0 of the output:
     * {@code out[i,0] = in[i,i]} for every non-zero diagonal entry. The output
     * non-zero count is set to the number of entries written.
     *
     * @param in  input block; read at positions {@code (i,i)} for {@code i < in.rlen}
     * @param out output block; allocated here and written through its dense block
     */
    private static void diagM2V(MatrixBlock in, MatrixBlock out) {
        DenseBlock target = out.allocateBlock().getDenseBlock();
        final int len = in.rlen;
        int written = 0;
        for (int k = 0; k < len; k++) {
            final double diag = in.quickGetValue(k, k);
            if (diag != 0.0) {
                target.set(k, 0, diag);
                written++;
            }
        }
        out.setNonZeros(written);
    }

    /**
     * Reshapes a dense input block into a dense {@code rows x cols} output block.
     *
     * <p>Row-wise reshapes keep the linearized row-major order, so the data is
     * either wrapped without copying (single underlying dense block) or copied
     * as-is. Column-wise reshapes map the column-major linear index of each
     * input cell to the column-major linear index of the output.
     *
     * @param in      dense input block; nothing is done if its dense block is null
     * @param out     output block
     * @param rows    number of output rows
     * @param cols    number of output columns
     * @param rowwise true for row-major reshape, false for column-major reshape
     */
    private static void reshapeDense(MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise) {
        if (in.denseBlock == null) {
            return;
        }
        final int inRows = in.rlen;
        final int inCols = in.clen;

        // row-wise with a single underlying block: wrap the input values in a
        // new dense block of the target shape without copying them
        if (rowwise && in.denseBlock.numBlocks() == 1) {
            out.denseBlock = DenseBlockFactory.createDenseBlock(in.getDenseBlockValues(), rows, cols);
            return;
        }

        out.allocateDenseBlock(false);
        DenseBlock src = in.getDenseBlock();
        DenseBlock dst = out.getDenseBlock();

        if (rowwise) {
            // same row-major layout: plain copy
            dst.set(src);
            return;
        }

        if (inRows == 1 || inCols == 1) {
            // vector -> matrix: read the input sequentially, write column by column
            double[] srcVals = src.valuesAt(0);
            double[] dstVals = dst.valuesAt(0);
            int srcPos = 0;
            for (int j = 0; j < cols; j++) {
                for (int i = 0, rowStart = 0; i < rows; i++, rowStart += cols) {
                    dstVals[rowStart + j] = srcVals[srcPos++];
                }
            }
        } else if (rows == 1 || cols == 1) {
            // matrix -> vector: read the input column by column, write sequentially
            double[] srcVals = src.valuesAt(0);
            double[] dstVals = dst.valuesAt(0);
            int dstPos = 0;
            for (int j = 0; j < inCols; j++) {
                for (int i = 0, rowStart = 0; i < inRows; i++, rowStart += inCols) {
                    dstVals[dstPos++] = srcVals[rowStart + j];
                }
            }
        } else {
            // matrix -> matrix: out[i,j] takes the input cell whose column-major
            // linear index is rows*j + i
            for (int i = 0; i < rows; i++) {
                double[] dstVals = dst.values(i);
                final int dstPos = dst.pos(i);
                for (int j = 0, lin = i; j < cols; j++, lin += rows) {
                    dstVals[dstPos + j] = src.get(lin % inRows, lin / inRows);
                }
            }
        }
    }

    /**
     * Reshapes a sparse input block into a sparse {@code rows x cols} output block.
     *
     * <p>Row-wise reshapes have four paths: a single output row, two paths for
     * the case where {@code cols} is a multiple of the input column count (one
     * building a CSR block directly, one appending row by row), and a general
     * path. Column-wise reshapes have a single-input-row path and a general
     * path; only the general column-wise path sorts the output rows afterwards.
     *
     * @param in      sparse input block; nothing is done if it is empty
     * @param out     output block; its sparse rows block is allocated here
     * @param rows    number of output rows
     * @param cols    number of output columns
     * @param rowwise true for row-major reshape, false for column-major reshape
     */
    private static void reshapeSparse(MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise) {
        int rlen = in.rlen;
        int clen = in.clen;
        if (in.isEmptyBlock(false)) {
            return;
        }
        out.allocateSparseRowsBlock(false);
        // average non-zeros per output row, used as initial row capacity
        int estnnz = (int)(in.nonZeros / (long)rows);
        SparseBlock a = in.sparseBlock;
        SparseBlock c = out.sparseBlock;
        if (rowwise) {
            if (rows == 1) {
                // matrix -> row vector: append all input rows to output row 0,
                // shifting column indexes by clen per input row
                c.allocate(0, estnnz, cols);
                int i = 0;
                int cix = 0;
                while (i < rlen) {
                    if (!a.isEmpty(i)) {
                        int apos = a.pos(i);
                        int alen = a.size(i);
                        int[] aix = a.indexes(i);
                        double[] avals = a.values(i);
                        for (int j = apos; j < apos + alen; ++j) {
                            c.append(0, cix + aix[j], avals[j]);
                        }
                    }
                    ++i;
                    cix += clen;
                }
            } else if (cols % clen == 0 && in.nonZeros < Integer.MAX_VALUE) {
                // n consecutive input rows form exactly one output row: build the
                // output directly as CSR (row pointers, column indexes, values)
                int n = cols / clen;
                int pos = 0;
                int[] rptr = new int[rows + 1];
                int[] indexes = new int[(int)a.size()];
                double[] values = null;
                rptr[0] = 0;
                if (a instanceof SparseBlockCSR) {
                    // CSR input: only the column indexes are rewritten; the values
                    // array of the input is reused as-is (see assignment below)
                    int[] aix = ((SparseBlockCSR)a).indexes();
                    int bi = 0;
                    int ci = 0;
                    while (bi < rlen) {
                        int i = bi;
                        int cix = 0;
                        while (i < bi + n) {
                            if (!a.isEmpty(i)) {
                                int apos = a.pos(i);
                                int alen = a.size(i);
                                for (int j = apos; j < apos + alen; ++j) {
                                    indexes[pos++] = cix + aix[j];
                                }
                            }
                            ++i;
                            cix += clen;
                        }
                        rptr[ci + 1] = pos;
                        bi += n;
                        ++ci;
                    }
                    values = ((SparseBlockCSR)a).values();
                } else {
                    // non-CSR input: values are copied row by row into a new array
                    values = new double[indexes.length];
                    int bi = 0;
                    int ci = 0;
                    while (bi < rlen) {
                        int i = bi;
                        int cix = 0;
                        while (i < bi + n) {
                            if (!a.isEmpty(i)) {
                                int apos = a.pos(i);
                                int alen = a.size(i);
                                int[] aix = a.indexes(i);
                                System.arraycopy(a.values(i), apos, values, pos, alen);
                                for (int j = apos; j < apos + alen; ++j) {
                                    indexes[pos++] = cix + aix[j];
                                }
                            }
                            ++i;
                            cix += clen;
                        }
                        rptr[ci + 1] = pos;
                        bi += n;
                        ++ci;
                    }
                }
                out.sparseBlock = new SparseBlockCSR(rptr, indexes, values, pos);
            } else if (cols % clen == 0) {
                // same n:1 row mapping, but appended into the allocated output
                // block; reached when nonZeros is not below Integer.MAX_VALUE
                int n = cols / clen;
                int bi = 0;
                int ci = 0;
                while (bi < rlen) {
                    // allocate the output row once with the exact size of the n input rows
                    long lnnz = a.size(bi, bi + n);
                    c.allocate(ci, (int)lnnz);
                    int i = bi;
                    int cix = 0;
                    while (i < bi + n) {
                        if (!a.isEmpty(i)) {
                            int apos = a.pos(i);
                            int alen = a.size(i);
                            int[] aix = a.indexes(i);
                            double[] avals = a.values(i);
                            for (int j = apos; j < apos + alen; ++j) {
                                c.append(ci, cix + aix[j], avals[j]);
                            }
                        }
                        ++i;
                        cix += clen;
                    }
                    bi += n;
                    ++ci;
                }
            } else {
                // general row-wise case: cix is the row-major linear offset of the
                // current input row, kept as long so the cell index cannot overflow int
                long cix = 0L;
                for (int i = 0; i < rlen; ++i) {
                    if (!a.isEmpty(i)) {
                        int apos = a.pos(i);
                        int alen = a.size(i);
                        int[] aix = a.indexes(i);
                        double[] avals = a.values(i);
                        for (int j = apos; j < apos + alen; ++j) {
                            int ci = (int)((cix + (long)aix[j]) / (long)cols);
                            int cj = (int)((cix + (long)aix[j]) % (long)cols);
                            c.allocate(ci, estnnz, cols);
                            c.append(ci, cj, avals[j]);
                        }
                    }
                    cix += (long)clen;
                }
            }
        } else if (rlen == 1) {
            // column-wise, row vector -> matrix: the input column index is the
            // column-major linear index of the output cell
            if (!a.isEmpty(0)) {
                int alen = a.size(0);
                int[] aix = a.indexes(0);
                double[] avals = a.values(0);
                for (int j = 0; j < alen; ++j) {
                    int ci = aix[j] % rows;
                    int cj = aix[j] / rows;
                    c.allocate(ci, estnnz, cols);
                    c.append(ci, cj, avals[j]);
                }
            }
        } else {
            // general column-wise case: tmpix is the column-major linear index of
            // input cell (i, aix[j]), mapped to output cell (tmpix % rows, tmpix / rows)
            for (int i = 0; i < rlen; ++i) {
                if (a.isEmpty(i)) continue;
                int apos = a.pos(i);
                int alen = a.size(i);
                int[] aix = a.indexes(i);
                double[] avals = a.values(i);
                for (int j = apos; j < apos + alen; ++j) {
                    long tmpix = (long)aix[j] * (long)rlen + (long)i;
                    int ci = (int)(tmpix % (long)rows);
                    int cj = (int)(tmpix / (long)rows);
                    c.allocate(ci, estnnz, cols);
                    c.append(ci, cj, avals[j]);
                }
            }
            // appends above are not ordered by column within an output row
            out.sortSparseRows();
        }
    }

    /**
     * Reshapes a dense input block into a sparse {@code rows x cols} output
     * block, appending only non-zero values.
     *
     * @param in      dense input block; nothing is done if its dense block is null
     * @param out     output block; its sparse rows block is allocated here
     * @param rows    number of output rows
     * @param cols    number of output columns
     * @param rowwise true for row-major reshape, false for column-major reshape
     */
    private static void reshapeDenseToSparse(MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise) {
        if (in.denseBlock == null) {
            return;
        }
        final int inRows = in.rlen;
        final int inCols = in.clen;

        out.allocateSparseRowsBlock(false);
        // average non-zeros per output row, used as initial row capacity
        final int rowCapacity = (int)(in.nonZeros / (long)rows);
        DenseBlock src = in.getDenseBlock();
        SparseBlock dst = out.sparseBlock;

        if (rowwise) {
            // map the row-major linear index of each non-zero to its output cell
            for (int i = 0; i < inRows; i++) {
                double[] srcVals = src.values(i);
                final int srcPos = src.pos(i);
                for (int j = 0; j < inCols; j++) {
                    final double v = srcVals[srcPos + j];
                    if (v != 0.0) {
                        final long lin = (long)i * (long)inCols + (long)j;
                        final int ti = (int)(lin / (long)cols);
                        final int tj = (int)(lin % (long)cols);
                        dst.allocate(ti, rowCapacity, cols);
                        dst.append(ti, tj, v);
                    }
                }
            }
        } else if (inRows == 1) {
            // row vector -> matrix: consume the input sequentially, column by column
            double[] srcVals = src.valuesAt(0);
            int srcPos = 0;
            for (int j = 0; j < cols; j++) {
                for (int i = 0; i < rows; i++) {
                    final double v = srcVals[srcPos++];
                    if (v != 0.0) {
                        dst.allocate(i, rowCapacity, cols);
                        dst.append(i, j, v);
                    }
                }
            }
        } else {
            // general column-wise case: out[i,j] takes the input cell whose
            // column-major linear index is rows*j + i
            for (int i = 0; i < rows; i++) {
                for (int j = 0, lin = i; j < cols; j++, lin += rows) {
                    final double v = src.get(lin % inRows, lin / inRows);
                    if (v != 0.0) {
                        dst.allocate(i, rowCapacity, cols);
                        dst.append(i, j, v);
                    }
                }
            }
        }
    }

    /**
     * Reshapes a sparse input block into a dense {@code rows x cols} output
     * block by writing every stored input entry to its target cell.
     *
     * @param in      sparse input block; nothing is done if its sparse block is null
     * @param out     output block; its dense block is allocated here
     * @param rows    number of output rows
     * @param cols    number of output columns
     * @param rowwise true for row-major reshape, false for column-major reshape
     */
    private static void reshapeSparseToDense(MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise) {
        final int inRows = in.rlen;
        final int inCols = in.clen;
        if (in.sparseBlock == null) {
            return;
        }
        out.allocateDenseBlock(false);
        SparseBlock src = in.sparseBlock;
        DenseBlock dst = out.getDenseBlock();

        if (rowwise) {
            // rowOffset is the row-major linear index of the first cell of input row i
            for (int i = 0, rowOffset = 0; i < inRows; i++, rowOffset += inCols) {
                if (src.isEmpty(i)) {
                    continue;
                }
                final int start = src.pos(i);
                final int len = src.size(i);
                int[] colIdx = src.indexes(i);
                double[] vals = src.values(i);
                for (int k = start; k < start + len; k++) {
                    final int ti = (rowOffset + colIdx[k]) / cols;
                    final int tj = (rowOffset + colIdx[k]) % cols;
                    dst.set(ti, tj, vals[k]);
                }
            }
        } else if (inRows == 1) {
            // row vector -> matrix: the input column index is the column-major
            // linear index of the output cell
            double[] dstVals = dst.valuesAt(0);
            if (!src.isEmpty(0)) {
                final int start = src.pos(0);
                final int len = src.size(0);
                int[] colIdx = src.indexes(0);
                double[] vals = src.values(0);
                for (int k = start; k < start + len; k++) {
                    final int ti = colIdx[k] % rows;
                    final int tj = colIdx[k] / rows;
                    dstVals[ti * cols + tj] = vals[k];
                }
            }
        } else {
            // general column-wise case: lin is the column-major linear index of
            // input cell (i, colIdx[k])
            for (int i = 0; i < inRows; i++) {
                if (src.isEmpty(i)) {
                    continue;
                }
                final int start = src.pos(i);
                final int len = src.size(i);
                int[] colIdx = src.indexes(i);
                double[] vals = src.values(i);
                for (int k = start; k < start + len; k++) {
                    final int lin = colIdx[k] * inRows + i;
                    dst.set(lin % rows, lin / rows, vals[k]);
                }
            }
        }
    }

    /**
     * Collects the indexes of all output blocks that the given input block
     * contributes to under a blocked reshape.
     *
     * <p>For each direction there are three strategies: a single first/last
     * range when the input has exactly one column (row-wise) or one row
     * (column-wise); per-non-zero lookup when the block holds fewer non-zeros
     * than rows (row-wise) or columns (column-wise) and {@code outputEmpty} is
     * false; otherwise one first/last range per input row or column.
     *
     * @param ixin        block index of the input block (1-based)
     * @param mcIn        characteristics of the input matrix
     * @param mcOut       characteristics of the output matrix
     * @param in          the input block
     * @param rowwise     true for row-major reshape, false for column-major reshape
     * @param outputEmpty if true, the per-non-zero strategy is not used
     * @return set of output block indexes
     */
    private static Collection<MatrixIndexes> computeAllResultBlockIndexes(MatrixIndexes ixin, DataCharacteristics mcIn, DataCharacteristics mcOut, MatrixBlock in, boolean rowwise, boolean outputEmpty) {
        final HashSet<MatrixIndexes> result = new HashSet<MatrixIndexes>();

        // global (0-based) cell range covered by the input block
        final long rowStart = (ixin.getRowIndex() - 1L) * (long)mcOut.getBlocksize();
        final long colStart = (ixin.getColumnIndex() - 1L) * (long)mcOut.getBlocksize();
        final long rowEnd = Math.min(mcIn.getRows(), rowStart + (long)mcIn.getBlocksize()) - 1L;
        final long colEnd = Math.min(mcIn.getCols(), colStart + (long)mcIn.getBlocksize()) - 1L;

        if (rowwise) {
            if (mcIn.getCols() == 1L) {
                MatrixIndexes head = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), rowStart, 0L, mcIn, mcOut, rowwise);
                MatrixIndexes tail = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), rowEnd, 0L, mcIn, mcOut, rowwise);
                LibMatrixReorg.createRowwiseIndexes(head, tail, mcOut.getNumColBlocks(), result);
            } else if (!outputEmpty && in.getNonZeros() < (long)in.getNumRows()) {
                LibMatrixReorg.createNonZeroIndexes(mcIn, mcOut, in, rowStart, colStart, rowwise, result);
            } else {
                for (long r = rowStart; r <= rowEnd; r++) {
                    MatrixIndexes head = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), r, colStart, mcIn, mcOut, rowwise);
                    MatrixIndexes tail = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), r, colEnd, mcIn, mcOut, rowwise);
                    LibMatrixReorg.createRowwiseIndexes(head, tail, mcOut.getNumColBlocks(), result);
                }
            }
            return result;
        }

        if (mcIn.getRows() == 1L) {
            MatrixIndexes head = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), 0L, colStart, mcIn, mcOut, rowwise);
            MatrixIndexes tail = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), 0L, colEnd, mcIn, mcOut, rowwise);
            LibMatrixReorg.createColwiseIndexes(head, tail, mcOut.getNumRowBlocks(), result);
        } else if (!outputEmpty && in.getNonZeros() < (long)in.getNumColumns()) {
            LibMatrixReorg.createNonZeroIndexes(mcIn, mcOut, in, rowStart, colStart, rowwise, result);
        } else {
            for (long c = colStart; c <= colEnd; c++) {
                MatrixIndexes head = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), rowStart, c, mcIn, mcOut, rowwise);
                MatrixIndexes tail = LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), rowEnd, c, mcIn, mcOut, rowwise);
                LibMatrixReorg.createColwiseIndexes(head, tail, mcOut.getNumRowBlocks(), result);
            }
        }
        return result;
    }

    /**
     * Adds {@code first}, {@code last}, and every block index between them in
     * row-major block order to {@code ret}. Block rows run from the row of
     * {@code first} to the row of {@code last}; block columns run up to
     * {@code ncblks}.
     *
     * @param first  first block index (1-based)
     * @param last   last block index (1-based)
     * @param ncblks number of column blocks per block row
     * @param ret    set receiving the block indexes
     * @throws RuntimeException if a row or column index of {@code first} or
     *                          {@code last} is not positive
     */
    private static void createRowwiseIndexes(MatrixIndexes first, MatrixIndexes last, long ncblks, HashSet<MatrixIndexes> ret) {
        if (first.getRowIndex() <= 0L || first.getColumnIndex() <= 0L) {
            throw new RuntimeException("Invalid computed first index: " + first.toString());
        }
        if (last.getRowIndex() <= 0L || last.getColumnIndex() <= 0L) {
            throw new RuntimeException("Invalid computed last index: " + last.toString());
        }

        ret.add(first);
        if (first.equals(last)) {
            return;
        }

        final long firstRow = first.getRowIndex();
        final long lastRow = last.getRowIndex();
        // same block row but first lies to the right of last: fill that row up to ncblks
        final boolean fill = firstRow == lastRow && first.getColumnIndex() > last.getColumnIndex();
        for (long bi = firstRow; bi <= lastRow; bi++) {
            final long from = (bi == firstRow) ? first.getColumnIndex() + 1L : 1L;
            final long to = (bi == lastRow && !fill) ? last.getColumnIndex() - 1L : ncblks;
            for (long bj = from; bj <= to; bj++) {
                ret.add(new MatrixIndexes(bi, bj));
            }
        }
        ret.add(last);
    }

    /**
     * Adds {@code first}, {@code last}, and every block index between them in
     * column-major block order to {@code ret}. Block columns run from the
     * column of {@code first} to the column of {@code last}; block rows run up
     * to {@code nrblks}.
     *
     * @param first  first block index (1-based)
     * @param last   last block index (1-based)
     * @param nrblks number of row blocks per block column
     * @param ret    set receiving the block indexes
     * @throws RuntimeException if a row or column index of {@code first} or
     *                          {@code last} is not positive
     */
    private static void createColwiseIndexes(MatrixIndexes first, MatrixIndexes last, long nrblks, HashSet<MatrixIndexes> ret) {
        if (first.getRowIndex() <= 0L || first.getColumnIndex() <= 0L) {
            throw new RuntimeException("Invalid computed first index: " + first.toString());
        }
        if (last.getRowIndex() <= 0L || last.getColumnIndex() <= 0L) {
            throw new RuntimeException("Invalid computed last index: " + last.toString());
        }

        ret.add(first);
        if (first.equals(last)) {
            return;
        }

        final long firstCol = first.getColumnIndex();
        final long lastCol = last.getColumnIndex();
        // same block column but first lies below last: fill that column up to nrblks
        final boolean fill = firstCol == lastCol && first.getRowIndex() > last.getRowIndex();
        for (long bj = firstCol; bj <= lastCol; bj++) {
            final long from = (bj == firstCol) ? first.getRowIndex() + 1L : 1L;
            final long to = (bj == lastCol && !fill) ? last.getRowIndex() - 1L : nrblks;
            for (long bi = from; bi <= to; bi++) {
                ret.add(new MatrixIndexes(bi, bj));
            }
        }
        ret.add(last);
    }

    /**
     * Adds to {@code ret} the output block index of every cell returned by the
     * input block's sparse iterator.
     *
     * @param mcIn       characteristics of the input matrix
     * @param mcOut      characteristics of the output matrix
     * @param in         input block
     * @param row_offset global row offset added to each cell's row index
     * @param col_offset global column offset added to each cell's column index
     * @param rowwise    true for row-major reshape, false for column-major reshape
     * @param ret        set receiving the block indexes
     */
    private static void createNonZeroIndexes(DataCharacteristics mcIn, DataCharacteristics mcOut, MatrixBlock in, long row_offset, long col_offset, boolean rowwise, HashSet<MatrixIndexes> ret) {
        for (Iterator<IJV> cells = in.getSparseBlockIterator(); cells.hasNext(); ) {
            final IJV cell = cells.next();
            final long globalRow = row_offset + (long)cell.getI();
            final long globalCol = col_offset + (long)cell.getJ();
            // a fresh MatrixIndexes per cell, since the instance is stored in the set
            ret.add(LibMatrixReorg.computeResultBlockIndex(new MatrixIndexes(), globalRow, globalCol, mcIn, mcOut, rowwise));
        }
    }

    /**
     * Creates one result block per given block index.
     *
     * @param rix   output block indexes
     * @param nnz   total number of non-zeros, split evenly across the blocks as
     *              the per-block estimate
     * @param mcOut characteristics of the output matrix
     * @return map from block index to its newly created block
     */
    private static Map<MatrixIndexes, MatrixBlock> createAllResultBlocks(Collection<MatrixIndexes> rix, long nnz, DataCharacteristics mcOut) {
        final int numBlocks = rix.size();
        return rix.stream().collect(Collectors.toMap(
            blockIx -> blockIx,
            blockIx -> LibMatrixReorg.createResultBlock(blockIx, nnz, numBlocks, mcOut)));
    }

    /**
     * Creates an output block for the given block index, sized via
     * {@code UtilFunctions.computeBlockSize} and with its sparse/dense format
     * chosen from the estimated number of non-zeros per block.
     *
     * @param ix      block index of the block to create
     * @param nnz     total number of non-zeros
     * @param nBlocks number of blocks the non-zeros are spread over
     * @param mcOut   characteristics of the output matrix
     * @return the new block
     * @throws DMLRuntimeException if a computed block dimension is smaller than 1
     */
    private static MatrixBlock createResultBlock(MatrixIndexes ix, long nnz, int nBlocks, DataCharacteristics mcOut) {
        final long bi = ix.getRowIndex();
        final long bj = ix.getColumnIndex();
        final int blockRows = UtilFunctions.computeBlockSize(mcOut.getRows(), bi, mcOut.getBlocksize());
        final int blockCols = UtilFunctions.computeBlockSize(mcOut.getCols(), bj, mcOut.getBlocksize());
        if (blockRows < 1 || blockCols < 1) {
            throw new DMLRuntimeException("Computed block dimensions (" + bi + "," + bj + " -> " + blockRows + "," + blockCols + ") are invalid!");
        }

        // even split of the non-zeros across all blocks
        final int nnzPerBlock = (int)(nnz / (long)nBlocks);
        final boolean useSparse = MatrixBlock.evalSparseFormatInMemory(blockRows, blockCols, nnzPerBlock);
        return new MatrixBlock(blockRows, blockCols, useSparse, nnzPerBlock);
    }

    /**
     * Blocked reshape of a dense input block: appends every non-zero value to
     * the pre-created output block that its target cell falls into.
     *
     * @param in         dense input block; nothing is done if it is empty
     * @param row_offset global row offset of the input block
     * @param col_offset global column offset of the input block
     * @param rix        pre-created output blocks by block index
     * @param mcIn       characteristics of the input matrix
     * @param mcOut      characteristics of the output matrix
     * @param rowwise    true for row-major reshape, false for column-major reshape
     * @throws DMLRuntimeException if the target block of a value is missing in {@code rix}
     */
    private static void reshapeDense(MatrixBlock in, long row_offset, long col_offset, Map<MatrixIndexes, MatrixBlock> rix, DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise) {
        if (in.isEmptyBlock(false)) {
            return;
        }
        final int numRows = in.rlen;
        final int numCols = in.clen;
        double[] vals = in.getDenseBlockValues();

        // one reusable index object for both block lookup and in-block position
        MatrixIndexes ixtmp = new MatrixIndexes();
        for (int i = 0, rowStart = 0; i < numRows; i++, rowStart += numCols) {
            final long globalRow = row_offset + (long)i;
            for (int j = 0; j < numCols; j++) {
                final double v = vals[rowStart + j];
                if (v != 0.0) {
                    final long globalCol = col_offset + (long)j;
                    ixtmp = LibMatrixReorg.computeResultBlockIndex(ixtmp, globalRow, globalCol, mcIn, mcOut, rowwise);
                    MatrixBlock target = LibMatrixReorg.getAllocatedBlock(rix, ixtmp);
                    ixtmp = LibMatrixReorg.computeInBlockIndex(ixtmp, globalRow, globalCol, mcIn, mcOut, rowwise);
                    target.appendValue((int)ixtmp.getRowIndex(), (int)ixtmp.getColumnIndex(), v);
                }
            }
        }

        // column-wise appends from a multi-row input are not ordered within a
        // sparse output row, so sort the sparse results
        if (!rowwise && mcIn.getRows() > 1L) {
            rix.values().stream().filter(blk -> blk.sparse).forEach(blk -> blk.sortSparseRows());
        }
    }

    /**
     * Blocked reshape of a sparse input block: appends every stored entry to
     * the pre-created output block that its target cell falls into.
     *
     * @param in         sparse input block; nothing is done if it is empty
     * @param row_offset global row offset of the input block
     * @param col_offset global column offset of the input block
     * @param rix        pre-created output blocks by block index
     * @param mcIn       characteristics of the input matrix
     * @param mcOut      characteristics of the output matrix
     * @param rowwise    true for row-major reshape, false for column-major reshape
     * @throws DMLRuntimeException if the target block of an entry is missing in {@code rix}
     */
    private static void reshapeSparse(MatrixBlock in, long row_offset, long col_offset, Map<MatrixIndexes, MatrixBlock> rix, DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise) {
        if (in.isEmptyBlock(false)) {
            return;
        }
        final int numRows = in.rlen;
        SparseBlock src = in.sparseBlock;

        // one reusable index object for both block lookup and in-block position
        MatrixIndexes ixtmp = new MatrixIndexes();
        for (int i = 0; i < numRows; i++) {
            if (src.isEmpty(i)) {
                continue;
            }
            final long globalRow = row_offset + (long)i;
            final int start = src.pos(i);
            final int len = src.size(i);
            int[] colIdx = src.indexes(i);
            double[] vals = src.values(i);
            for (int k = start; k < start + len; k++) {
                final long globalCol = col_offset + (long)colIdx[k];
                ixtmp = LibMatrixReorg.computeResultBlockIndex(ixtmp, globalRow, globalCol, mcIn, mcOut, rowwise);
                MatrixBlock target = LibMatrixReorg.getAllocatedBlock(rix, ixtmp);
                ixtmp = LibMatrixReorg.computeInBlockIndex(ixtmp, globalRow, globalCol, mcIn, mcOut, rowwise);
                target.appendValue((int)ixtmp.getRowIndex(), (int)ixtmp.getColumnIndex(), vals[k]);
            }
        }

        // column-wise appends from a multi-row input are not ordered within a
        // sparse output row, so sort the sparse results
        if (!rowwise && mcIn.getRows() > 1L) {
            rix.values().stream().filter(blk -> blk.sparse).forEach(blk -> blk.sortSparseRows());
        }
    }

    /**
     * Looks up the output block for a block index and fails if it is absent.
     *
     * @param rix output blocks by block index
     * @param ix  block index to look up
     * @return the block mapped to {@code ix}
     * @throws DMLRuntimeException if {@code rix} has no block for {@code ix}
     */
    private static MatrixBlock getAllocatedBlock(Map<MatrixIndexes, MatrixBlock> rix, MatrixIndexes ix) {
        final MatrixBlock block = rix.get(ix);
        if (block != null) {
            return block;
        }
        throw new DMLRuntimeException("Missing result block: " + ix);
    }

    /**
     * Computes the 1-based output block index that the input cell
     * {@code (ai, aj)} maps to under a reshape, and stores it in {@code ixout}.
     *
     * @param ixout   index object to overwrite
     * @param ai      global 0-based row index of the input cell
     * @param aj      global 0-based column index of the input cell
     * @param mcIn    characteristics of the input matrix
     * @param mcOut   characteristics of the output matrix
     * @param rowwise true for row-major reshape, false for column-major reshape
     * @return the result of {@code ixout.setIndexes(...)}
     */
    private static MatrixIndexes computeResultBlockIndex(MatrixIndexes ixout, long ai, long aj, DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise) {
        final long linear = LibMatrixReorg.computeGlobalCellIndex(mcIn, ai, aj, rowwise);

        // target cell in the output matrix (0-based)
        final long outRow;
        final long outCol;
        if (rowwise) {
            outRow = linear / mcOut.getCols();
            outCol = linear % mcOut.getCols();
        } else {
            outRow = linear % mcOut.getRows();
            outCol = linear / mcOut.getRows();
        }

        // block containing that cell (1-based)
        final long blockRow = outRow / (long)mcOut.getBlocksize() + 1L;
        final long blockCol = outCol / (long)mcOut.getBlocksize() + 1L;
        return ixout.setIndexes(blockRow, blockCol);
    }

    /**
     * Computes the 0-based position inside its output block that the input
     * cell {@code (ai, aj)} maps to under a reshape, and stores it in
     * {@code ixout}.
     *
     * @param ixout   index object to overwrite
     * @param ai      global 0-based row index of the input cell
     * @param aj      global 0-based column index of the input cell
     * @param mcIn    characteristics of the input matrix
     * @param mcOut   characteristics of the output matrix
     * @param rowwise true for row-major reshape, false for column-major reshape
     * @return the result of {@code ixout.setIndexes(...)}
     */
    private static MatrixIndexes computeInBlockIndex(MatrixIndexes ixout, long ai, long aj, DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise) {
        final long linear = LibMatrixReorg.computeGlobalCellIndex(mcIn, ai, aj, rowwise);

        // target cell in the output matrix, reduced modulo the block size
        final long inBlockRow;
        final long inBlockCol;
        if (rowwise) {
            inBlockRow = linear / mcOut.getCols() % (long)mcOut.getBlocksize();
            inBlockCol = linear % mcOut.getCols() % (long)mcOut.getBlocksize();
        } else {
            inBlockRow = linear % mcOut.getRows() % (long)mcOut.getBlocksize();
            inBlockCol = linear / mcOut.getRows() % (long)mcOut.getBlocksize();
        }
        return ixout.setIndexes(inBlockRow, inBlockCol);
    }

    /**
     * Linearizes the cell {@code (ai, aj)} of the input matrix: row-major
     * ({@code ai * cols + aj}) if {@code rowwise}, otherwise column-major
     * ({@code ai + rows * aj}).
     *
     * @param mcIn    characteristics of the input matrix
     * @param ai      0-based row index
     * @param aj      0-based column index
     * @param rowwise true for row-major, false for column-major linearization
     * @return linear cell index
     */
    private static long computeGlobalCellIndex(DataCharacteristics mcIn, long ai, long aj, boolean rowwise) {
        if (rowwise) {
            return ai * mcIn.getCols() + aj;
        }
        return ai + mcIn.getRows() * aj;
    }

    /**
     * Removes rows from the input block and writes the remaining rows, in
     * their original order, into {@code ret}.
     *
     * <p>If {@code select} is null, a row is kept when it is non-empty
     * (sparse input) or contains at least one non-zero value (dense input).
     * Otherwise the rows to keep are taken from {@code select} via
     * {@code DataConverter.convertToBooleanVector}.
     *
     * <p>Several paths share data with the input instead of copying it: the
     * CSR special case reuses the input's index and value arrays, and when no
     * row is removed the input's sparse or dense block is assigned to
     * {@code ret} directly.
     *
     * @param in          input block
     * @param ret         result block; reset and filled here
     * @param select      optional selection vector, or null to detect empty rows
     * @param emptyReturn if true, the result has at least one row even when no
     *                    row is kept
     * @return {@code ret}
     */
    private static MatrixBlock removeEmptyRows(MatrixBlock in, MatrixBlock ret, MatrixBlock select, boolean emptyReturn) {
        int i;
        int i2;
        int m = in.rlen;
        int n = in.clen;
        boolean[] flags = null;
        int rlen2 = 0;
        // Special case: non-empty CSR input without selection vector. Empty rows
        // hold no entries, so only the row pointers need to be rebuilt.
        if (in.sparse && !in.isEmptyBlock(false) && select == null && in.sparseBlock instanceof SparseBlockCSR && in.nonZeros < Integer.MAX_VALUE) {
            SparseBlockCSR sblock = (SparseBlockCSR)in.sparseBlock;
            // count non-empty rows
            int lrlen = 0;
            for (i2 = 0; i2 < m; ++i2) {
                lrlen += sblock.isEmpty(i2) ? 0 : 1;
            }
            // only taken if the reduced output would still be stored sparse;
            // otherwise fall through to the generic path below
            if (MatrixBlock.evalSparseFormatInMemory(lrlen, n, in.nonZeros)) {
                int[] rptr = new int[lrlen + 1];
                int j = 0;
                int pos = 0;
                for (int i3 = 0; i3 < m; ++i3) {
                    if (sblock.isEmpty(i3)) continue;
                    rptr[++j] = pos += sblock.size(i3);
                }
                ret.reset(lrlen, in.clen, true);
                // index and value arrays of the input are passed on without copying
                ret.sparseBlock = new SparseBlockCSR(rptr, sblock.indexes(), sblock.values(), (int)in.nonZeros);
                ret.nonZeros = in.nonZeros;
                return ret;
            }
        }
        // Step 1: determine which rows to keep (flags) and how many (rlen2)
        if (select == null) {
            Block a;
            flags = new boolean[m];
            if (in.sparse) {
                a = in.sparseBlock;
                for (i = 0; i < m; ++i) {
                    flags[i] = !((SparseBlock)a).isEmpty(i);
                    rlen2 += flags[i] ? 1 : 0;
                }
            } else {
                a = in.getDenseBlock();
                block3: for (i = 0; i < m; ++i) {
                    double[] avals = ((DenseBlock)a).values(i);
                    int aix = ((DenseBlock)a).pos(i);
                    for (int j = 0; j < n; ++j) {
                        if (avals[aix + j] == 0.0) continue;
                        flags[i] = true;
                        ++rlen2;
                        // first non-zero found: move on to the next row
                        continue block3;
                    }
                }
            }
        } else {
            flags = DataConverter.convertToBooleanVector(select);
            rlen2 = (int)select.getNonZeros();
        }
        // Step 2: reset the result and copy the kept rows
        rlen2 = Math.max(rlen2, emptyReturn ? 1 : 0);
        boolean sp = MatrixBlock.evalSparseFormatInMemory(rlen2, n, in.nonZeros);
        ret.reset(rlen2, n, sp);
        if (in.isEmptyBlock(false)) {
            return ret;
        }
        if (m == rlen2) {
            // nothing removed: share the input's data block with the result
            ret.sparse = in.sparse;
            if (ret.sparse) {
                ret.sparseBlock = in.sparseBlock;
            } else {
                ret.denseBlock = in.denseBlock;
            }
        } else if (in.sparse) {
            // sparse input: append kept rows one by one
            // NOTE(review): the trailing 'false' presumably means the row is not deep-copied - confirm
            int cix = 0;
            for (i = 0; i < m; ++i) {
                if (!flags[i]) continue;
                ret.appendRow(cix++, in.sparseBlock.get(i), false);
            }
        } else if (!in.sparse && !ret.sparse) {
            // dense input, dense output: copy kept rows as a whole
            ret.allocateDenseBlock();
            DenseBlock a = in.getDenseBlock();
            DenseBlock c = ret.getDenseBlock();
            int ci = 0;
            for (int i4 = 0; i4 < m; ++i4) {
                if (!flags[i4]) continue;
                System.arraycopy(a.values(i4), a.pos(i4), c.values(ci), c.pos(ci), n);
                ++ci;
            }
        } else {
            // dense input, sparse output: append kept rows value by value
            ret.allocateSparseRowsBlock();
            DenseBlock a = in.getDenseBlock();
            int ci = 0;
            for (i2 = 0; i2 < m; ++i2) {
                if (!flags[i2]) continue;
                double[] avals = a.values(i2);
                int aix = a.pos(i2);
                for (int j = 0; j < n; ++j) {
                    ret.appendValue(ci, j, avals[aix + j]);
                }
                ++ci;
            }
        }
        // without a selection vector only empty rows were dropped, so the
        // non-zero count is unchanged; with one it has to be recomputed
        ret.nonZeros = select == null ? in.nonZeros : ret.recomputeNonZeros();
        ret.examSparsity();
        return ret;
    }

    /**
     * Removes columns from the input block and writes the remaining columns,
     * in their original order, into {@code ret}.
     *
     * <p>If {@code select} is null, a column is kept when it contains at least
     * one stored entry (sparse input) or one non-zero value (dense input).
     * Otherwise the columns to keep are taken from {@code select} via
     * {@code DataConverter.convertToBooleanVector}.
     *
     * <p>When no column is removed, the input's sparse or dense block is
     * assigned to {@code ret} directly instead of being copied.
     *
     * @param in          input block
     * @param ret         result block; reset and filled here
     * @param select      optional selection vector, or null to detect empty columns
     * @param emptyReturn if true, the result has at least one column even when
     *                    no column is kept
     * @return {@code ret}
     */
    private static MatrixBlock removeEmptyColumns(MatrixBlock in, MatrixBlock ret, MatrixBlock select, boolean emptyReturn) {
        int m = in.rlen;
        int n = in.clen;
        boolean[] flags = null;
        // Step 1: determine which columns to keep; the scan visits every row
        // completely (no early exit)
        if (select == null) {
            int i;
            Block a;
            flags = new boolean[n];
            if (in.sparse) {
                a = in.sparseBlock;
                for (i = 0; i < m; ++i) {
                    if (((SparseBlock)a).isEmpty(i)) continue;
                    int apos = ((SparseBlock)a).pos(i);
                    int alen = ((SparseBlock)a).size(i);
                    int[] aix = ((SparseBlock)a).indexes(i);
                    for (int j = apos; j < apos + alen; ++j) {
                        flags[aix[j]] = true;
                    }
                }
            } else {
                a = in.getDenseBlock();
                for (i = 0; i < m; ++i) {
                    double[] avals = ((DenseBlock)a).values(i);
                    int aix = ((DenseBlock)a).pos(i);
                    for (int j = 0; j < n; ++j) {
                        int n2 = j;
                        flags[n2] = flags[n2] | avals[aix + j] != 0.0;
                    }
                }
            }
        } else {
            flags = DataConverter.convertToBooleanVector(select);
        }
        // Step 2: count the kept columns
        int clen2 = 0;
        for (int j = 0; j < n; ++j) {
            clen2 += flags[j] ? 1 : 0;
        }
        // Step 3: reset the result and copy the kept columns
        clen2 = Math.max(clen2, emptyReturn ? 1 : 0);
        boolean sp = MatrixBlock.evalSparseFormatInMemory(m, clen2, in.nonZeros);
        ret.reset(m, clen2, sp);
        if (in.isEmptyBlock(false)) {
            return ret;
        }
        if (n == clen2) {
            // nothing removed: share the input's data block with the result
            ret.sparse = in.sparse;
            if (ret.sparse) {
                ret.sparseBlock = in.sparseBlock;
            } else {
                ret.denseBlock = in.denseBlock;
            }
        } else {
            int i;
            // cix[j] is the output column of kept input column j
            // (entries of removed columns stay 0 and are never read)
            int[] cix = new int[n];
            int pos = 0;
            for (int j = 0; j < n; ++j) {
                if (!flags[j]) continue;
                cix[j] = pos++;
            }
            if (in.sparse) {
                // sparse input: append kept entries to the result
                SparseBlock a = in.sparseBlock;
                for (i = 0; i < m; ++i) {
                    if (a.isEmpty(i)) continue;
                    int apos = a.pos(i);
                    int alen = a.size(i);
                    int[] aix = a.indexes(i);
                    double[] avals = a.values(i);
                    for (int j = apos; j < apos + alen; ++j) {
                        if (!flags[aix[j]]) continue;
                        ret.appendValue(i, cix[aix[j]], avals[j]);
                    }
                }
            } else if (!in.sparse && !ret.sparse) {
                // dense input, dense output: write kept values to their new column
                ret.allocateDenseBlock();
                DenseBlock a = in.getDenseBlock();
                DenseBlock c = ret.getDenseBlock();
                for (int i2 = 0; i2 < m; ++i2) {
                    double[] avals = a.values(i2);
                    double[] cvals = c.values(i2);
                    int aix = a.pos(i2);
                    int lcix = c.pos(i2);
                    for (int j = 0; j < n; ++j) {
                        if (!flags[j]) continue;
                        cvals[lcix + cix[j]] = avals[aix + j];
                    }
                }
            } else {
                // dense input, sparse output: append only non-zero kept values
                ret.allocateSparseRowsBlock();
                DenseBlock a = in.getDenseBlock();
                for (i = 0; i < m; ++i) {
                    double[] avals = a.values(i);
                    int aix = a.pos(i);
                    for (int j = 0; j < n; ++j) {
                        double aval = avals[aix + j];
                        if (!flags[j] || aval == 0.0) continue;
                        ret.appendValue(i, cix[j], aval);
                    }
                }
            }
        }
        // without a selection vector only empty columns were dropped, so the
        // non-zero count is unchanged; with one it has to be recomputed
        ret.nonZeros = select == null ? in.nonZeros : ret.recomputeNonZeros();
        ret.examSparsity();
        return ret;
    }

    /**
     * Builds an indicator matrix with {@code max} rows and {@code in.rlen} columns from
     * column 0 of {@code in}: a valid value {@code v} read at input row {@code r}
     * appends a 1.0 at output cell {@code (v-1, r)}.
     * <p>
     * A value is valid when it is integral and lies in {@code [1, max]}; all other
     * values are silently skipped, except that values {@code <= 0} raise an error when
     * {@code ignore} is false.
     *
     * @param in     input block whose column 0 is read
     * @param ret    output block; reset here to {@code max x in.rlen}
     * @param max    number of output rows and largest accepted value
     * @param cast   if true, each value is passed through {@code UtilFunctions.toLong} first
     * @param ignore if false, values {@code <= 0} cause a {@link DMLRuntimeException}
     * @return {@code ret}
     */
    private static MatrixBlock rexpandRows(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore) {
        final int numRows = max;
        final int numCols = in.rlen;
        ret.reset(numRows, numCols, MatrixBlock.evalSparseFormatInMemory(numRows, numCols, in.nonZeros));

        // The input is consumed in chunks of at most blksize entries; the two
        // scratch arrays hold the chunk's values and their source row indexes.
        final int blksize = 0x100000;
        final int bufLen = Math.min(blksize, numCols);
        int[] tmpi = new int[bufLen];
        double[] tmp = new double[bufLen];

        for (int off = 0; off < numCols; off += blksize) {
            final int len = Math.min(blksize, numCols - off);
            LibMatrixReorg.copyColVector(in, off, tmp, tmpi, len);
            // sort the chunk by value (indexes are permuted alongside)
            SortUtils.sortByValue(0, len, tmp, tmpi);
            for (int p = 0; p < len; p++) {
                double v = tmp[p];
                if (cast) {
                    v = UtilFunctions.toLong(v);
                }
                if (!ignore && v <= 0.0) {
                    throw new DMLRuntimeException("Invalid input value <= 0 for ignore=false: " + v);
                }
                // only integral values within [1, max] produce an output entry (NaN fails all tests)
                boolean valid = v == Math.floor(v) && v >= 1.0 && v <= (double)max;
                if (valid) {
                    ret.appendValue((int)(v - 1.0), tmpi[p], 1.0);
                }
            }
        }

        // entries were appended chunk by chunk, so sparse rows are sorted at the end
        if (ret.isInSparseFormat()) {
            ret.sortSparseRows();
        }
        return ret;
    }

    /**
     * Builds an indicator matrix with {@code in.rlen} rows and {@code max} columns,
     * delegating the per-row work to the range-based {@code rexpandColumns} overload.
     * <p>
     * The work runs on the calling thread when {@code k <= 1}, when the input has at
     * most {@code PAR_NUMCELL_THRESHOLD} rows, or when the output is sparse. Otherwise
     * the row range is split into at most {@code 8*k} contiguous chunks that are run on
     * a pool obtained from {@link CommonThreadPool#get(int)}.
     *
     * @param in     input block
     * @param ret    output block; reset and allocated here
     * @param max    number of output columns and largest accepted value
     * @param cast   forwarded to the range worker
     * @param ignore forwarded to the range worker
     * @param k      requested degree of parallelism
     * @return {@code ret}, with its non-zero count set to the sum reported by the workers
     * @throws DMLRuntimeException wrapping any exception raised on the parallel path
     */
    private static MatrixBlock rexpandColumns(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore, int k) {
        final int rlen = in.rlen;
        final int clen = max;
        final long nnz = in.nonZeros;
        final boolean sp = MatrixBlock.evalSparseFormatInMemory(rlen, clen, nnz);
        ret.reset(rlen, clen, sp);
        ret.allocateBlock();
        long rnnz = 0L;
        if (k <= 1 || (long)in.getNumRows() <= PAR_NUMCELL_THRESHOLD || sp) {
            rnnz = LibMatrixReorg.rexpandColumns(in, ret, max, cast, ignore, 0, rlen);
        } else {
            ExecutorService pool = null;
            try {
                pool = CommonThreadPool.get(k);
                List<RExpandColsTask> tasks = new ArrayList<>();
                final int blklen = (int)Math.ceil((double)rlen / (double)k / 8.0);
                for (int i = 0; i < 8 * k && i * blklen < rlen; ++i) {
                    tasks.add(new RExpandColsTask(in, ret, max, cast, ignore, i * blklen, Math.min((i + 1) * blklen, rlen)));
                }
                // invokeAll returns only after every task has completed
                List<Future<Long>> taskret = pool.invokeAll(tasks);
                for (Future<Long> task : taskret) {
                    rnnz += task.get();
                }
            }
            catch (Exception ex) {
                throw new DMLRuntimeException(ex);
            }
            finally {
                // release the pool on the failure path as well
                if (pool != null) {
                    pool.shutdown();
                }
            }
        }
        ret.setNonZeros(rnnz);
        return ret;
    }

    /**
     * Range worker for column expansion: for every row in {@code [rl, ru)} reads the
     * value in column 0 of {@code in} and, if it is an integral value in
     * {@code [1, max]}, records column {@code value-1} for that row.
     * <p>
     * When {@code ret} is sparse the column positions are collected in an
     * {@code int[in.rlen]} pre-filled with -1 and handed to the
     * {@link SparseBlockCSR} constructor at the end; otherwise the 1.0 is written
     * directly into the dense block.
     *
     * @param in     input block whose column 0 is read
     * @param ret    output block (already reset/allocated by the caller)
     * @param max    largest accepted value
     * @param cast   if true, each value is passed through {@code UtilFunctions.toLong} first
     * @param ignore if false, values {@code <= 0} cause a {@link DMLRuntimeException}
     * @param rl     first row (inclusive)
     * @param ru     last row (exclusive)
     * @return the result of {@code ret.setNonZeros} called with the number of entries recorded here
     */
    private static long rexpandColumns(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore, int rl, int ru) {
        int count = 0;
        int[] colOfRow = null;
        if (ret.sparse) {
            colOfRow = new int[in.rlen];
            Arrays.fill(colOfRow, -1);
        }
        DenseBlock denseOut = ret.getDenseBlock();
        SparseBlock sparseOut = ret.getSparseBlock();
        for (int row = rl; row < ru; row++) {
            double v = in.quickGetValue(row, 0);
            if (cast) {
                v = UtilFunctions.toLong(v);
            }
            if (!ignore && v <= 0.0) {
                throw new DMLRuntimeException("Invalid input value <= 0 for ignore=false: " + v);
            }
            // skip anything that is not an integral value within [1, max]
            boolean valid = v == Math.floor(v) && v >= 1.0 && v <= (double)max;
            if (!valid) {
                continue;
            }
            final int col = (int)(v - 1.0);
            if (colOfRow != null) {
                colOfRow[row] = col;
            } else if (ret.sparse) {
                sparseOut.allocate(row, 1);
                sparseOut.append(row, col, 1.0);
            } else {
                denseOut.set(row, col, 1.0);
            }
            count++;
        }
        if (colOfRow != null) {
            ret.sparseBlock = new SparseBlockCSR(in.rlen, count, colOfRow);
        }
        return ret.setNonZeros(count);
    }

    /**
     * Copies {@code len} entries of column 0 of {@code in}, starting at row
     * {@code ixin}, into {@code tmp[0..len)} and stores the matching source row
     * indexes {@code ixin, ixin+1, ...} into {@code tmpi[0..len)}.
     *
     * @param in   input block
     * @param ixin first source row
     * @param tmp  destination for the values
     * @param tmpi destination for the row indexes
     * @param len  number of entries to copy
     */
    private static void copyColVector(MatrixBlock in, int ixin, double[] tmp, int[] tmpi, int len) {
        // values
        if (in.isEmptyBlock(false)) {
            Arrays.fill(tmp, 0, len, 0.0);
        } else if (!in.sparse) {
            // dense input: bulk copy straight from the backing value array
            System.arraycopy(in.getDenseBlockValues(), ixin, tmp, 0, len);
        } else {
            for (int p = 0; p < len; p++) {
                tmp[p] = in.quickGetValue(ixin + p, 0);
            }
        }
        // row indexes
        for (int p = 0, row = ixin; p < len; p++, row++) {
            tmpi[p] = row;
        }
    }

    /**
     * Reverses, in place, the first {@code m1.rlen} entries of the array returned by
     * {@code m1.getDenseBlockValues()}.
     *
     * @param m1 block whose dense values are reversed
     */
    private static void sortReverseDense(MatrixBlock m1) {
        int hi = m1.rlen - 1;
        double[] vals = m1.getDenseBlockValues();
        // swap from both ends towards the middle
        for (int lo = 0; lo < hi; lo++, hi--) {
            double held = vals[lo];
            vals[lo] = vals[hi];
            vals[hi] = held;
        }
    }

    /**
     * Reverses the given int array in place.
     *
     * @param a array to reverse
     */
    private static void sortReverseDense(int[] a) {
        int lo = 0;
        int hi = a.length - 1;
        // swap from both ends towards the middle
        while (lo < hi) {
            int held = a[lo];
            a[lo] = a[hi];
            a[hi] = held;
            lo++;
            hi--;
        }
    }

    /**
     * Reverses the given double array in place.
     *
     * @param a array to reverse
     */
    private static void sortReverseDense(double[] a) {
        int lo = 0;
        int hi = a.length - 1;
        // swap from both ends towards the middle
        while (lo < hi) {
            double held = a[lo];
            a[lo] = a[hi];
            a[hi] = held;
            lo++;
            hi--;
        }
    }

    /**
     * Merges neighbouring runs of length {@code blockLength} in {@code values}
     * (and {@code valueIndexes}) pairwise using {@code MergeTask}s, then recurses
     * with the doubled run length. The recursion stops once
     * {@code 2*blockLength > values.length + blockLength}.
     *
     * @param blockLength  length of the runs to be merged at this level
     * @param valueIndexes indexes permuted together with {@code values}
     * @param values       values being merged
     * @param k            degree of parallelism passed to {@link CommonThreadPool#get(int)}
     * @throws DMLRuntimeException wrapping any exception raised while merging
     */
    private static void mergeSortedBlocks(int blockLength, int[] valueIndexes, double[] values, int k) {
        final int mergedLen = blockLength * 2;
        final int total = values.length;
        if (mergedLen > total + blockLength) {
            return;
        }
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            ArrayList<MergeTask> tasks = new ArrayList<MergeTask>();
            for (int start = 0; start < total; start += mergedLen) {
                // a pair without a right-hand run needs no merge
                if (start + blockLength >= total) {
                    continue;
                }
                int stop = Math.min(total, start + mergedLen);
                tasks.add(new MergeTask(start, stop, blockLength, valueIndexes, values));
            }
            CommonThreadPool.invokeAndShutdown(pool, tasks);
            // next level: runs are now twice as long
            LibMatrixReorg.mergeSortedBlocks(mergedLen, valueIndexes, values, k);
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    /**
     * Resolves ties inside {@code values[rl..ru)}: every run of equal adjacent values
     * is re-sorted by the column {@code by[off]-1} of {@code in} (looked up through
     * {@code vix}), recursing to the next entry of {@code by} while one exists.
     * After a run has been processed its slots in {@code values} are restored to the
     * original tied value.
     *
     * @param rl     start of the range (inclusive)
     * @param ru     end of the range (exclusive)
     * @param values sort keys of the current level; temporarily overwritten per run
     * @param vix    row indexes permuted together with {@code values}
     * @param in     block supplying the secondary keys
     * @param by     1-based column ids of the sort keys
     * @param off    position in {@code by} of the column used at this level
     */
    private static void sortBySecondary(int rl, int ru, double[] values, int[] vix, MatrixBlock in, int[] by, int off) {
        int i = rl;
        while (i < ru - 1) {
            final double key = values[i];
            // find the exclusive end of the run of values equal to key
            int end = i + 1;
            while (end < ru && key == values[end]) {
                end++;
            }
            if (end - i > 1) {
                // load the secondary key for every member of the run
                for (int j = i; j < end; j++) {
                    values[j] = in.quickGetValue(vix[j], by[off] - 1);
                }
                SortUtils.sortByValue(i, end, values, vix);
                if (off + 1 < by.length) {
                    LibMatrixReorg.sortBySecondary(i, end, values, vix, in, by, off + 1);
                }
                // put the primary key back so the caller's view stays unchanged
                Arrays.fill(values, i, end, key);
            }
            i = end;
        }
    }

    /**
     * Walks the runs of equal adjacent values in {@code values[rl..ru)}. While
     * {@code off} still addresses an entry of {@code by}, a run's values are replaced by
     * column {@code by[off]-1} of {@code in} and the run is processed recursively with
     * {@code off+1}; once {@code by} is exhausted, the run's entries in {@code vix} are
     * sorted ascending.
     *
     * @param rl     start of the range (inclusive)
     * @param ru     end of the range (exclusive)
     * @param values sort keys of the current level; overwritten for runs that recurse
     * @param vix    row indexes belonging to {@code values}
     * @param in     block supplying the keys of the next level
     * @param by     1-based column ids of the sort keys
     * @param off    position in {@code by} of the column used for the next level
     */
    private static void sortIndexesStable(int rl, int ru, double[] values, int[] vix, MatrixBlock in, int[] by, int off) {
        int i = rl;
        while (i < ru - 1) {
            final double key = values[i];
            // find the exclusive end of the run of values equal to key
            int end = i + 1;
            while (end < ru && key == values[end]) {
                end++;
            }
            if (end - i > 1) {
                if (off >= by.length) {
                    // no further key column: order the tied rows by index
                    Arrays.sort(vix, i, end);
                } else {
                    for (int j = i; j < end; j++) {
                        values[j] = in.quickGetValue(vix[j], by[off] - 1);
                    }
                    LibMatrixReorg.sortIndexesStable(i, end, values, vix, in, by, off + 1);
                }
            }
            i = end;
        }
    }

    /**
     * Checks a list of 1-based column ids: it must be non-null, non-empty, no longer
     * than {@code clen}, and every entry must lie in {@code [1, clen]}.
     *
     * @param by   column ids to check
     * @param clen number of columns
     * @return true if all conditions hold
     */
    private static boolean isValidSortByList(int[] by, int clen) {
        if (by == null || by.length == 0 || by.length > clen) {
            return false;
        }
        for (int col : by) {
            if (col <= 0 || col > clen) {
                return false;
            }
        }
        return true;
    }

    /**
     * Increments {@code c[ai[i]]} for every {@code i} in {@code [0, len)}.
     * The first {@code len % 8} entries are handled one by one, the rest in
     * groups of eight.
     *
     * @param c   counters, indexed by the values found in {@code ai}
     * @param ai  positions to count
     * @param len number of leading entries of {@code ai} to process
     */
    private static void countAgg(int[] c, int[] ai, int len) {
        final int rest = len % 8;
        // leading remainder
        for (int p = 0; p < rest; p++) {
            c[ai[p]]++;
        }
        // groups of eight
        for (int p = rest; p < len; p += 8) {
            c[ai[p]]++;
            c[ai[p + 1]]++;
            c[ai[p + 2]]++;
            c[ai[p + 3]]++;
            c[ai[p + 4]]++;
            c[ai[p + 5]]++;
            c[ai[p + 6]]++;
            c[ai[p + 7]]++;
        }
    }

    /**
     * Increments {@code c[aix[i]]} for every {@code i} in {@code [ai, ai+len)}.
     * The first {@code len % 8} entries are handled one by one, the rest in
     * groups of eight.
     *
     * @param c   counters, indexed by the values found in {@code aix}
     * @param aix positions to count
     * @param ai  offset of the first entry of {@code aix} to process
     * @param len number of entries to process
     */
    private static void countAgg(int[] c, int[] aix, int ai, int len) {
        final int restEnd = ai + len % 8;
        final int end = ai + len;
        // leading remainder
        for (int p = ai; p < restEnd; p++) {
            c[aix[p]]++;
        }
        // groups of eight
        for (int p = restEnd; p < end; p += 8) {
            c[aix[p]]++;
            c[aix[p + 1]]++;
            c[aix[p + 2]]++;
            c[aix[p + 3]]++;
            c[aix[p + 4]]++;
            c[aix[p + 5]]++;
            c[aix[p + 6]]++;
            c[aix[p + 7]]++;
        }
    }

    /**
     * Transposes the rows {@code [rl, ru)} of a sparse input into a newly created
     * sparse block with {@code in.getNumColumns()} rows and {@code ru-rl} columns,
     * backed by a {@link SparseBlockMCSR}. Input row {@code r} becomes output column
     * {@code r-rl}.
     *
     * @param in input block in sparse format
     * @param rl first input row (inclusive)
     * @param ru last input row (exclusive)
     * @return the new block holding the transposed row range
     */
    private static MatrixBlock transposeSparseToSparseBlock(MatrixBlock in, int rl, int ru) {
        final int rows = in.getNumRows();
        final int cols = in.getNumColumns();
        final int width = ru - rl;
        SparseBlock src = in.getSparseBlock();
        MatrixBlock ret = new MatrixBlock(cols, width, true);
        SparseBlockMCSR dst = new SparseBlockMCSR(cols, width);
        SparseRow[] dstRows = dst.getRows();

        // size estimate per output row, derived from the input's overall density
        double density = (double)in.nonZeros / (double)rows / (double)cols;
        int estimate = (int)(density * (double)width);
        for (int c = 0; c < cols; c++) {
            dst.allocate(c, Math.max(2, estimate), width);
        }

        for (int r = rl; r < ru; r++) {
            if (src.isEmpty(r)) {
                continue;
            }
            final int begin = src.pos(r);
            final int stop = begin + src.size(r);
            int[] colIdx = src.indexes(r);
            double[] vals = src.values(r);
            final int outCol = r - rl;
            for (int p = begin; p < stop; p++) {
                // append may hand back a different row object, so store the result
                final int outRow = colIdx[p];
                dstRows[outRow] = dstRows[outRow].append(outCol, vals[p]);
            }
        }
        ret.setSparseBlock(dst);
        ret.recomputeNonZeros();
        return ret;
    }

    /**
     * Task that copies rows of {@code in} into {@code out} according to a row
     * permutation: output row {@code i} receives input row {@code vix[i]} for every
     * {@code i} in {@code [rl, ru)}.
     */
    private static class CopyTask
    implements Callable<Object> {
        private final MatrixBlock _in;
        private final MatrixBlock _out;
        private final int[] _vix;
        private final int _rl;
        private final int _ru;

        protected CopyTask(MatrixBlock in, MatrixBlock out, int[] vix, int rl, int ru) {
            _in = in;
            _out = out;
            _vix = vix;
            _rl = rl;
            _ru = ru;
        }

        /** Copies the assigned row range; always returns {@code null}. */
        @Override
        public Object call() {
            final int rowWidth = _in.clen;
            if (_out.sparse) {
                copySparseRows();
            } else {
                copyDenseRows(rowWidth);
            }
            return null;
        }

        /** Dense path: one array copy of {@code rowWidth} cells per row. */
        private void copyDenseRows(int rowWidth) {
            DenseBlock src = _in.getDenseBlock();
            DenseBlock dst = _out.getDenseBlock();
            for (int row = _rl; row < _ru; row++) {
                final int from = _vix[row];
                System.arraycopy(src.values(from), src.pos(from), dst.values(row), dst.pos(row), rowWidth);
            }
        }

        /** Sparse path: non-empty source rows are set on the output's sparse block. */
        private void copySparseRows() {
            for (int row = _rl; row < _ru; row++) {
                final int from = _vix[row];
                if (!_in.sparseBlock.isEmpty(from)) {
                    _out.sparseBlock.set(row, _in.sparseBlock.get(from), false);
                }
            }
        }
    }

    /**
     * Task that merges two adjacent runs of a value/index array pair in place:
     * the left run {@code [start, start+blockSize)} and the right run
     * {@code [start+blockSize, end)}. The runs are expected to be sorted ascending
     * by the caller; the merge fills the range from the back, taking the left
     * element only when it is strictly greater than the right one.
     */
    private static class MergeTask
    implements Callable<Object> {
        private final int _start;
        private final int _end;
        private final int _blockSize;
        private final int[] _indexes;
        private final double[] _values;

        protected MergeTask(int start, int end, int blockSize, int[] indexes, double[] values) {
            _start = start;
            _end = end;
            _blockSize = blockSize;
            _indexes = indexes;
            _values = values;
        }

        /** Performs the merge; always returns {@code 1L}. */
        @Override
        public Long call() {
            final int middle = _start + _blockSize;
            if (middle == _end) {
                // empty right run: nothing to merge
                return 1L;
            }
            // The right run is copied out so its slots can be overwritten while merging.
            final int[] rightIdx = Arrays.copyOfRange(_indexes, middle, _end);
            final double[] rightVal = Arrays.copyOfRange(_values, middle, _end);

            int left = middle - 1;
            int right = _end - middle - 1;
            int dst = _end - 1;
            // Once the right copy is exhausted, the remaining left elements are already in place.
            while (dst >= _start && right >= 0) {
                final boolean takeLeft = left >= _start && _values[left] > rightVal[right];
                if (takeLeft) {
                    _values[dst] = _values[left];
                    _indexes[dst] = _indexes[left];
                    left--;
                } else {
                    _values[dst] = rightVal[right];
                    _indexes[dst] = rightIdx[right];
                    right--;
                }
                dst--;
            }
            return 1L;
        }
    }

    /**
     * Task that sorts the range {@code [start, end)} of a value array via
     * {@code SortUtils.sortByValue}, passing the index array along.
     */
    private static class SortTask
    implements Callable<Object> {
        private final int _start;
        private final int _end;
        private final int[] _indexes;
        private final double[] _values;

        protected SortTask(int start, int end, int[] indexes, double[] values) {
            _start = start;
            _end = end;
            _indexes = indexes;
            _values = values;
        }

        /** Sorts the assigned range; always returns {@code 1L}. */
        @Override
        public Long call() {
            SortUtils.sortByValue(_start, _end, _values, _indexes);
            return 1L;
        }
    }

    /**
     * Task that runs the range-based {@code LibMatrixReorg.rexpandColumns} on the
     * rows {@code [rl, ru)} and returns its result.
     */
    private static class RExpandColsTask
    implements Callable<Long> {
        private final MatrixBlock _in;
        private final MatrixBlock _out;
        private final int _max;
        private final boolean _cast;
        private final boolean _ignore;
        private final int _rl;
        private final int _ru;

        protected RExpandColsTask(MatrixBlock in, MatrixBlock out, int max, boolean cast, boolean ignore, int rl, int ru) {
            _in = in;
            _out = out;
            _max = max;
            _cast = cast;
            _ignore = ignore;
            _rl = rl;
            _ru = ru;
        }

        /** @return the value returned by the range worker for this task's rows */
        @Override
        public Long call() {
            final long result = LibMatrixReorg.rexpandColumns(_in, _out, _max, _cast, _ignore, _rl, _ru);
            return result;
        }
    }

    /**
     * Task that delegates to {@code LibMatrixReorg.countNnzPerColumn} for the rows
     * {@code [rl, ru)} of the given block.
     */
    private static class CountNnzTask
    implements Callable<int[]> {
        private MatrixBlock _in;
        private int _rl;
        private int _ru;

        protected CountNnzTask(MatrixBlock in, int rl, int ru) {
            _in = in;
            _rl = rl;
            _ru = ru;
        }

        /** @return the array produced by {@code countNnzPerColumn} for this range */
        @Override
        public int[] call() {
            int[] counts = LibMatrixReorg.countNnzPerColumn(_in, _rl, _ru);
            return counts;
        }
    }

    /**
     * Task that transposes one slice of {@code in} into {@code out}. With
     * {@code row == true} the slice is the row range {@code [rl, ru)} over all columns;
     * otherwise it is the column range {@code [rl, ru)} over all rows. The kernel is
     * chosen from the sparse flags of input and output.
     */
    private static class TransposeTask
    implements Callable<MatrixBlock> {
        private MatrixBlock _in;
        private MatrixBlock _out;
        private boolean _row;
        private int _rl;
        private int _ru;
        private int[] _cnt;
        private boolean allowReturnBlock;

        protected TransposeTask(MatrixBlock in, MatrixBlock out, boolean row, int rl, int ru, int[] cnt, boolean returnBlock) {
            _in = in;
            _out = out;
            _row = row;
            _rl = rl;
            _ru = ru;
            _cnt = cnt;
            allowReturnBlock = returnBlock;
        }

        /**
         * Runs the transpose kernel for this slice.
         *
         * @return the block produced by {@code transposeSparseToSparseBlock} when both
         *         sides are sparse, the output is not CSR and returning a block is
         *         allowed; {@code null} in every other case
         * @throws DMLRuntimeException for dense input with sparse output
         */
        @Override
        public MatrixBlock call() {
            // translate the task range into row and column bounds
            final int rl;
            final int ru;
            final int cl;
            final int cu;
            if (_row) {
                rl = _rl;
                ru = _ru;
                cl = 0;
                cu = _in.clen;
            } else {
                rl = 0;
                ru = _in.rlen;
                cl = _rl;
                cu = _ru;
            }

            if (_in.sparse) {
                if (!_out.sparse) {
                    LibMatrixReorg.transposeSparseToDense(_in, _out, rl, ru, cl, cu);
                } else if (_out.sparseBlock instanceof SparseBlockCSR) {
                    LibMatrixReorg.transposeSparseToSparseCSR(_in, _out, rl, ru, cl, cu, _cnt);
                } else if (allowReturnBlock) {
                    return LibMatrixReorg.transposeSparseToSparseBlock(_in, rl, ru);
                } else {
                    LibMatrixReorg.transposeSparseToSparse(_in, _out, rl, ru, cl, cu, _cnt);
                }
            } else if (!_out.sparse) {
                LibMatrixReorg.transposeDenseToDense(_in, _out, rl, ru, cl, cu);
            } else {
                throw new DMLRuntimeException("Unsupported multi-threaded dense-sparse transpose.");
            }
            return null;
        }
    }

    /**
     * Orders row indexes by the value found in a fixed column of a block, larger
     * values first.
     */
    private static class DescRowComparator
    implements Comparator<Integer> {
        private MatrixBlock _mb;
        private int _col;

        public DescRowComparator(MatrixBlock mb, int col) {
            _mb = mb;
            _col = col;
        }

        /**
         * @return -1 if the first row's value is greater, 0 if both values compare
         *         equal with {@code ==}, 1 otherwise
         */
        @Override
        public int compare(Integer arg0, Integer arg1) {
            final double first = _mb.quickGetValue(arg0, _col);
            final double second = _mb.quickGetValue(arg1, _col);
            if (first > second) {
                return -1;
            }
            if (first == second) {
                return 0;
            }
            return 1;
        }
    }

    /**
     * Orders row indexes by the value found in a fixed column of a block, smaller
     * values first.
     */
    private static class AscRowComparator
    implements Comparator<Integer> {
        private MatrixBlock _mb;
        private int _col;

        public AscRowComparator(MatrixBlock mb, int col) {
            _mb = mb;
            _col = col;
        }

        /**
         * @return -1 if the first row's value is smaller, 0 if both values compare
         *         equal with {@code ==}, 1 otherwise
         */
        @Override
        public int compare(Integer arg0, Integer arg1) {
            final double first = _mb.quickGetValue(arg0, _col);
            final double second = _mb.quickGetValue(arg1, _col);
            if (first < second) {
                return -1;
            }
            if (first == second) {
                return 0;
            }
            return 1;
        }
    }

    /**
     * Task that calls {@code LibMatrixReorg.rj_inv} for every {@code j} in
     * {@code [jStart, jEnd)} on the shared array {@code A}, using a scratch array
     * obtained from {@code memPool}.
     */
    private static class r_invTask
    implements Callable<Object> {
        final double[] _A;
        final int _jStart;
        final int _jEnd;
        final int _b;
        final int _n;
        final int _m;

        r_invTask(double[] A, int jStart, int jEnd, int b, int n, int m) {
            this._A = A;
            this._jStart = jStart;
            this._jEnd = jEnd;
            this._b = b;
            this._n = n;
            this._m = m;
        }

        /** Processes the assigned index range; always returns {@code null}. */
        @Override
        public Object call() {
            final int required = Math.max(this._m, this._n);
            double[] tmp = memPool.get();
            // NOTE(review): memPool looks like a per-thread cache; an array cached by an
            // earlier call with smaller dimensions would be too short, so the length
            // is checked as well as null.
            if (tmp == null || tmp.length < required) {
                memPool.set(new double[required]);
                tmp = memPool.get();
            }
            for (int j = this._jStart; j < this._jEnd; ++j) {
                LibMatrixReorg.rj_inv(tmp, this._A, j, this._b, this._n, this._m);
            }
            return null;
        }
    }

    /**
     * Task that calls {@code LibMatrixReorg.di_inv_safe} for every {@code i} in
     * {@code [iStart, iEnd)} on the shared array {@code A}, using a scratch array
     * obtained from {@code memPool}.
     */
    private static class d_invTask
    implements Callable<Object> {
        final double[] _A;
        final int _iStart;
        final int _iEnd;
        final int _a_inv;
        final int _b;
        final int _c;
        final int _n;
        final int _m;

        d_invTask(double[] A, int iStart, int iEnd, int a_inv, int b, int c, int n, int m) {
            this._A = A;
            this._iStart = iStart;
            this._iEnd = iEnd;
            this._a_inv = a_inv;
            this._b = b;
            this._c = c;
            this._n = n;
            this._m = m;
        }

        /** Processes the assigned index range; always returns {@code null}. */
        @Override
        public Object call() {
            final int required = Math.max(this._m, this._n);
            double[] tmp = memPool.get();
            // NOTE(review): memPool looks like a per-thread cache; an array cached by an
            // earlier call with smaller dimensions would be too short, so the length
            // is checked as well as null.
            if (tmp == null || tmp.length < required) {
                memPool.set(new double[required]);
                tmp = memPool.get();
            }
            for (int i = this._iStart; i < this._iEnd; ++i) {
                LibMatrixReorg.di_inv_safe(tmp, this._A, i, this._a_inv, this._b, this._c, this._n, this._m);
            }
            return null;
        }
    }

    /**
     * Task that calls {@code LibMatrixReorg.sj_inv} for every {@code j} in
     * {@code [jStart, jEnd)} on the shared array {@code A}, using a scratch array
     * obtained from {@code memPool}.
     */
    private static class s_invTask
    implements Callable<Object> {
        final double[] _A;
        final int _jStart;
        final int _jEnd;
        final int _a;
        final int _n;
        final int _m;

        s_invTask(double[] A, int jStart, int jEnd, int a, int n, int m) {
            this._A = A;
            this._jStart = jStart;
            this._jEnd = jEnd;
            this._a = a;
            this._n = n;
            this._m = m;
        }

        /** Processes the assigned index range; always returns {@code null}. */
        @Override
        public Object call() {
            final int required = Math.max(this._m, this._n);
            double[] tmp = memPool.get();
            // NOTE(review): memPool looks like a per-thread cache; an array cached by an
            // earlier call with smaller dimensions would be too short, so the length
            // is checked as well as null.
            if (tmp == null || tmp.length < required) {
                memPool.set(new double[required]);
                tmp = memPool.get();
            }
            for (int j = this._jStart; j < this._jEnd; ++j) {
                LibMatrixReorg.sj_inv(tmp, this._A, j, this._a, this._n, this._m);
            }
            return null;
        }
    }

    /**
     * Task that calls {@code LibMatrixReorg.sj} for every {@code j} in
     * {@code [jStart, jEnd)} on the shared array {@code A}, using a scratch array
     * obtained from {@code memPool}.
     */
    private static class sTask
    implements Callable<Object> {
        final double[] _A;
        final int _jStart;
        final int _jEnd;
        final int _a;
        final int _n;
        final int _m;

        sTask(double[] A, int jStart, int jEnd, int a, int n, int m) {
            this._A = A;
            this._jStart = jStart;
            this._jEnd = jEnd;
            this._a = a;
            this._n = n;
            this._m = m;
        }

        /** Processes the assigned index range; always returns {@code null}. */
        @Override
        public Object call() {
            final int required = Math.max(this._m, this._n);
            double[] tmp = memPool.get();
            // NOTE(review): memPool looks like a per-thread cache; an array cached by an
            // earlier call with smaller dimensions would be too short, so the length
            // is checked as well as null.
            if (tmp == null || tmp.length < required) {
                memPool.set(new double[required]);
                tmp = memPool.get();
            }
            for (int j = this._jStart; j < this._jEnd; ++j) {
                LibMatrixReorg.sj(tmp, this._A, j, this._a, this._n, this._m);
            }
            return null;
        }
    }

    /**
     * Task that calls {@code LibMatrixReorg.di} for every {@code i} in
     * {@code [iStart, iEnd)} on the shared array {@code A}, using a scratch array
     * obtained from {@code memPool}.
     */
    private static class dTask
    implements Callable<Object> {
        final double[] _A;
        final int _iStart;
        final int _iEnd;
        final int _b;
        final int _n;
        final int _m;

        dTask(double[] A, int iStart, int iEnd, int b, int n, int m) {
            this._A = A;
            this._iStart = iStart;
            this._iEnd = iEnd;
            this._b = b;
            this._n = n;
            this._m = m;
        }

        /** Processes the assigned index range; always returns {@code null}. */
        @Override
        public Object call() {
            final int required = Math.max(this._m, this._n);
            double[] tmp = memPool.get();
            // NOTE(review): memPool looks like a per-thread cache; an array cached by an
            // earlier call with smaller dimensions would be too short, so the length
            // is checked as well as null.
            if (tmp == null || tmp.length < required) {
                memPool.set(new double[required]);
                tmp = memPool.get();
            }
            for (int i = this._iStart; i < this._iEnd; ++i) {
                LibMatrixReorg.di(tmp, this._A, i, this._b, this._n, this._m);
            }
            return null;
        }
    }

    /**
     * Task that calls {@code LibMatrixReorg.rj} for every {@code j} in
     * {@code [jStart, jEnd)} on the shared array {@code A}, using a scratch array
     * obtained from {@code memPool}.
     */
    private static class rTask
    implements Callable<Object> {
        final double[] _A;
        final int _jStart;
        final int _jEnd;
        final int _b;
        final int _n;
        final int _m;

        rTask(double[] A, int jStart, int jEnd, int b, int n, int m) {
            this._A = A;
            this._jStart = jStart;
            this._jEnd = jEnd;
            this._b = b;
            this._n = n;
            this._m = m;
        }

        /** Processes the assigned index range; always returns {@code null}. */
        @Override
        public Object call() {
            final int required = Math.max(this._m, this._n);
            double[] tmp = memPool.get();
            // NOTE(review): memPool looks like a per-thread cache; an array cached by an
            // earlier call with smaller dimensions would be too short, so the length
            // is checked as well as null.
            if (tmp == null || tmp.length < required) {
                memPool.set(new double[required]);
                tmp = memPool.get();
            }
            for (int j = this._jStart; j < this._jEnd; ++j) {
                LibMatrixReorg.rj(tmp, this._A, j, this._b, this._n, this._m);
            }
            return null;
        }
    }

    /**
     * Task that swaps the cells {@code (r, c)} and {@code (c, r)} of an array addressed
     * as {@code r * rowAndCols + c}, for all rows in {@code [rowStart, rowStop)} and
     * all columns in {@code [max(r+1, colStart), colStop)}, i.e. only cells strictly
     * above the diagonal are visited.
     */
    private static class TransposeInPlaceTrivialTask
    implements Callable<Object> {
        private final int _rowStart;
        private final int _rowStop;
        private final int _colStart;
        private final int _colStop;
        private final int _rowAndCols;
        private final double[] _values;

        TransposeInPlaceTrivialTask(int rowStart, int rowStop, int colStart, int colStop, int rowAndCols, double[] values) {
            _rowStart = rowStart;
            _rowStop = rowStop;
            _colStart = colStart;
            _colStop = colStop;
            _rowAndCols = rowAndCols;
            _values = values;
        }

        /** Performs the swaps for the assigned tile; always returns {@code null}. */
        @Override
        public Object call() {
            for (int r = _rowStart; r < _rowStop; r++) {
                final int rowBase = r * _rowAndCols;
                // start right of the diagonal so each pair is swapped once
                final int firstCol = Math.max(r + 1, _colStart);
                for (int c = firstCol; c < _colStop; c++) {
                    LibMatrixReorg.swap(_values, rowBase + c, c * _rowAndCols + r);
                }
            }
            return null;
        }
    }

    /**
     * Task that forwards its dense block, sparse row array and row/column bounds to
     * {@code LibMatrixReorg.transposeDenseToSparseMMRange}.
     */
    private static class TransposeDenseToSparseTask
    implements Callable<Object> {
        private DenseBlock a;
        private SparseRowVector[] rows;
        private int rl;
        private int ru;
        private int cl;
        private int cu;

        protected TransposeDenseToSparseTask(DenseBlock a, SparseRowVector[] rows, int rl, int ru, int cl, int cu) {
            this.a = a;
            this.rows = rows;
            this.rl = rl;
            this.ru = ru;
            this.cl = cl;
            this.cu = cu;
        }

        /** Runs the range kernel; always returns {@code null}. */
        @Override
        public Object call() {
            final DenseBlock src = this.a;
            final SparseRowVector[] dst = this.rows;
            LibMatrixReorg.transposeDenseToSparseMMRange(src, dst, this.rl, this.ru, this.cl, this.cu);
            return null;
        }
    }

    /**
     * Kinds of reorganization operations distinguished by this class, plus an
     * {@code INVALID} marker. NOTE(review): how the constants map to operators is
     * decided outside this enum — confirm against the code that produces them.
     */
    private static enum ReorgType {
        TRANSPOSE, REV, DIAG, RESHAPE, SORT, INVALID
    }
}

