This page was generated from dialect/graphblas_dialect_tutorials/graphblas_optimize/fuse_multiply_reduce.ipynb.
Fusing graphblas.matrix_multiply with graphblas.reduce_to_scalar_generic¶
This example will go over how to use the --graphblas-structuralize and --graphblas-optimize passes from graphblas-opt to fuse graphblas.matrix_multiply ops with graphblas.reduce_to_scalar ops into graphblas.matrix_multiply_reduce_to_scalar_generic ops.
Let’s first import some necessary libraries.
import tempfile
from mlir_graphblas.cli import GRAPHBLAS_OPT_EXE
Using development graphblas-opt: /Users/pnguyen/code/mlir-graphblas/mlir_graphblas/src/build/bin/graphblas-opt
Since sparse tensor encodings can be very verbose in MLIR, let’s import some helpers to make the MLIR code more readable.
from mlir_graphblas.tools import tersify_mlir
Fusion¶
Here, we’ll show an example of how we can fuse a graphblas.matrix_multiply op with a graphblas.reduce_to_scalar op into a graphblas.matrix_multiply_reduce_to_scalar_generic op. Note that this requires using the --graphblas-structuralize pass prior to using the --graphblas-optimize pass so that the code can be in a state optimizable by the --graphblas-optimize pass. In particular, we need all graphblas.reduce_to_scalar ops lowered into
graphblas.reduce_to_scalar_generic ops.
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(i,j) -> (i,j)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
#CSC64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(i,j) -> (j,i)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
func @fuse_adjacent(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>) -> f64 {
%C = graphblas.matrix_multiply %A, %B { semiring = "plus_plus" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64>
%reduce_result = graphblas.reduce_to_scalar %C { aggregator = "plus" } : tensor<?x?xf64, #CSR64> to f64
return %reduce_result : f64
}
"""
with tempfile.NamedTemporaryFile() as temp:
temp_file_name = temp.name
with open(temp_file_name, 'w') as f:
f.write(mlir_text)
temp.flush()
output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-structuralize --graphblas-optimize
output_mlir = "\n".join(output_mlir)
output_mlir = tersify_mlir(output_mlir)
print(output_mlir)
#CSR64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(d0, d1) -> (d0, d1)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
#CSC64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(d0, d1) -> (d1, d0)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
module {
func @fuse_adjacent(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>) -> f64 {
%cst = arith.constant 0.000000e+00 : f64
%0 = graphblas.matrix_multiply_reduce_to_scalar_generic %arg0, %arg1 {mask_complement = false} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to f64 {
graphblas.yield add_identity %cst : f64
}, {
^bb0(%arg2: f64, %arg3: f64):
%1 = arith.addf %arg2, %arg3 : f64
graphblas.yield add %1 : f64
}, {
^bb0(%arg2: f64, %arg3: f64):
%1 = arith.addf %arg2, %arg3 : f64
graphblas.yield mult %1 : f64
}, {
graphblas.yield agg_identity %cst : f64
}, {
^bb0(%arg2: f64, %arg3: f64):
%1 = arith.addf %arg2, %arg3 : f64
graphblas.yield agg %1 : f64
}
return %0 : f64
}
}
Non-applicable Fusion¶
One thing to note is that if the result of any intermediate values of the ops being fused, e.g. the result of a graphblas.matrix_multiply, is used elsewhere, the fusion cannot and will not apply as shown here.
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(i,j) -> (i,j)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
#CSC64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(i,j) -> (j,i)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
func @nofuse_multi_use(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>) -> (f64, tensor<?x?xf64, #CSR64>) {
%C = graphblas.matrix_multiply %A, %B { semiring = "plus_plus" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64>
%reduce_result = graphblas.reduce_to_scalar %C { aggregator = "plus" } : tensor<?x?xf64, #CSR64> to f64
return %reduce_result, %C : f64, tensor<?x?xf64, #CSR64>
}
"""
with tempfile.NamedTemporaryFile() as temp:
temp_file_name = temp.name
with open(temp_file_name, 'w') as f:
f.write(mlir_text)
temp.flush()
output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-structuralize --graphblas-optimize
output_mlir = "\n".join(output_mlir)
output_mlir = tersify_mlir(output_mlir)
print(output_mlir)
#CSR64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(d0, d1) -> (d0, d1)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
#CSC64 = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "compressed" ],
dimOrdering = affine_map<(d0, d1) -> (d1, d0)>,
pointerBitWidth = 64,
indexBitWidth = 64
}>
module {
func @nofuse_multi_use(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>) -> (f64, tensor<?x?xf64, #CSR64>) {
%cst = arith.constant 0.000000e+00 : f64
%0 = graphblas.matrix_multiply_generic %arg0, %arg1 {mask_complement = false} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64> {
graphblas.yield add_identity %cst : f64
}, {
^bb0(%arg2: f64, %arg3: f64):
%2 = arith.addf %arg2, %arg3 : f64
graphblas.yield add %2 : f64
}, {
^bb0(%arg2: f64, %arg3: f64):
%2 = arith.addf %arg2, %arg3 : f64
graphblas.yield mult %2 : f64
}
%1 = graphblas.reduce_to_scalar_generic %0 : tensor<?x?xf64, #CSR64> to f64 {
graphblas.yield agg_identity %cst : f64
}, {
^bb0(%arg2: f64, %arg3: f64):
%2 = arith.addf %arg2, %arg3 : f64
graphblas.yield agg %2 : f64
}
return %1, %0 : f64, tensor<?x?xf64, #CSR64>
}
}