This page was generated from dialect/graphblas_dialect_tutorials/graphblas_optimize/fuse_multiply_reduce.ipynb.

Fusing graphblas.matrix_multiply with graphblas.reduce_to_scalar_generic

This example will go over how to use the --graphblas-structuralize and --graphblas-optimize passes from graphblas-opt to fuse graphblas.matrix_multiply ops with graphblas.reduce_to_scalar ops into graphblas.matrix_multiply_reduce_to_scalar_generic ops.

Let’s first import some necessary libraries.

import tempfile
from mlir_graphblas.cli import GRAPHBLAS_OPT_EXE

Using development graphblas-opt: /Users/pnguyen/code/mlir-graphblas/mlir_graphblas/src/build/bin/graphblas-opt

Since sparse tensor encodings can be very verbose in MLIR, let’s import some helpers to make the MLIR code more readable.

from mlir_graphblas.tools import tersify_mlir

Fusion

Here, we’ll show an example of how we can fuse a graphblas.matrix_multiply op with a graphblas.reduce_to_scalar op into a graphblas.matrix_multiply_reduce_to_scalar_generic op. Note that this requires using the --graphblas-structuralize pass prior to using the --graphblas-optimize pass so that the code can be in a state optimizable by the --graphblas-optimize pass. In particular, we need all graphblas.reduce_to_scalar ops lowered into graphblas.reduce_to_scalar_generic ops.

# MLIR input for the fusable case: the result of graphblas.matrix_multiply (%C)
# has exactly one use, the graphblas.reduce_to_scalar that follows it.
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (i,j)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

func @fuse_adjacent(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>) -> f64 {
    %C = graphblas.matrix_multiply %A, %B { semiring = "plus_plus" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64>
    %reduce_result = graphblas.reduce_to_scalar %C { aggregator = "plus" } : tensor<?x?xf64, #CSR64> to f64
    return %reduce_result : f64
}
"""

with tempfile.NamedTemporaryFile() as temp:
    temp_file_name = temp.name
    with open(temp_file_name, 'w') as f:
        f.write(mlir_text)
    temp.flush()

    output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-structuralize --graphblas-optimize
    output_mlir = "\n".join(output_mlir)
    output_mlir = tersify_mlir(output_mlir)

print(output_mlir)

#CSR64 = #sparse_tensor.encoding<{
    dimLevelType = [ "dense", "compressed" ],
    dimOrdering = affine_map<(d0, d1) -> (d0, d1)>,
    pointerBitWidth = 64,
    indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
    dimLevelType = [ "dense", "compressed" ],
    dimOrdering = affine_map<(d0, d1) -> (d1, d0)>,
    pointerBitWidth = 64,
    indexBitWidth = 64
}>

module {
  func @fuse_adjacent(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>) -> f64 {
    %cst = arith.constant 0.000000e+00 : f64
    %0 = graphblas.matrix_multiply_reduce_to_scalar_generic %arg0, %arg1 {mask_complement = false} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to f64 {
      graphblas.yield add_identity %cst : f64
    }, {
    ^bb0(%arg2: f64, %arg3: f64):
      %1 = arith.addf %arg2, %arg3 : f64
      graphblas.yield add %1 : f64
    }, {
    ^bb0(%arg2: f64, %arg3: f64):
      %1 = arith.addf %arg2, %arg3 : f64
      graphblas.yield mult %1 : f64
    }, {
      graphblas.yield agg_identity %cst : f64
    }, {
    ^bb0(%arg2: f64, %arg3: f64):
      %1 = arith.addf %arg2, %arg3 : f64
      graphblas.yield agg %1 : f64
    }
    return %0 : f64
  }
}

Non-applicable Fusion

One thing to note is that if any intermediate value produced by the ops being fused, e.g. the result of a graphblas.matrix_multiply, is used elsewhere, the fusion cannot and will not be applied, as shown here.

# MLIR input for the non-fusable case: the result of graphblas.matrix_multiply
# (%C) is both reduced to a scalar and returned from the function.
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (i,j)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

func @nofuse_multi_use(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>) -> (f64, tensor<?x?xf64, #CSR64>) {
    %C = graphblas.matrix_multiply %A, %B { semiring = "plus_plus" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64>
    %reduce_result = graphblas.reduce_to_scalar %C { aggregator = "plus" } : tensor<?x?xf64, #CSR64> to f64
    return %reduce_result, %C : f64, tensor<?x?xf64, #CSR64>
}
"""

with tempfile.NamedTemporaryFile() as temp:
    temp_file_name = temp.name
    with open(temp_file_name, 'w') as f:
        f.write(mlir_text)
    temp.flush()

    output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-structuralize --graphblas-optimize
    output_mlir = "\n".join(output_mlir)
    output_mlir = tersify_mlir(output_mlir)

print(output_mlir)

#CSR64 = #sparse_tensor.encoding<{
    dimLevelType = [ "dense", "compressed" ],
    dimOrdering = affine_map<(d0, d1) -> (d0, d1)>,
    pointerBitWidth = 64,
    indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
    dimLevelType = [ "dense", "compressed" ],
    dimOrdering = affine_map<(d0, d1) -> (d1, d0)>,
    pointerBitWidth = 64,
    indexBitWidth = 64
}>

module {
  func @nofuse_multi_use(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>) -> (f64, tensor<?x?xf64, #CSR64>) {
    %cst = arith.constant 0.000000e+00 : f64
    %0 = graphblas.matrix_multiply_generic %arg0, %arg1 {mask_complement = false} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64> {
      graphblas.yield add_identity %cst : f64
    }, {
    ^bb0(%arg2: f64, %arg3: f64):
      %2 = arith.addf %arg2, %arg3 : f64
      graphblas.yield add %2 : f64
    }, {
    ^bb0(%arg2: f64, %arg3: f64):
      %2 = arith.addf %arg2, %arg3 : f64
      graphblas.yield mult %2 : f64
    }
    %1 = graphblas.reduce_to_scalar_generic %0 : tensor<?x?xf64, #CSR64> to f64 {
      graphblas.yield agg_identity %cst : f64
    }, {
    ^bb0(%arg2: f64, %arg3: f64):
      %2 = arith.addf %arg2, %arg3 : f64
      graphblas.yield agg %2 : f64
    }
    return %1, %0 : f64, tensor<?x?xf64, #CSR64>
  }
}

GraphBLAS Optimizing Pass | Fusing graphblas.matrix_multiply with graphblas.apply