From a91a5886e5f34d74eb1f012048767d5c1217ffff Mon Sep 17 00:00:00 2001
From: Corey Zumar
Date: Thu, 22 Mar 2018 13:10:58 -0700
Subject: [PATCH 1/5] Mixed fusion

---
 mixed_weights_fusion.py | 105 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 mixed_weights_fusion.py

diff --git a/mixed_weights_fusion.py b/mixed_weights_fusion.py
new file mode 100644
index 0000000..178239d
--- /dev/null
+++ b/mixed_weights_fusion.py
@@ -0,0 +1,105 @@
+import tensorflow as tf
+import numpy as np
+
+# Code that attempts to find candidate layers to merge together to reduce inference time.
+# Worthwhile experiment: how does cuBLAS matrix multiply performance scale with matrix size?
+# Hard constraints: GPU memory capacity (12GB), multiplication precision
+# Soft constraints: for small models, we could probably run quite a few kernels concurrently
+
+# TODO: Remove layers with unused output nodes
+
+# Vertical layer fusion is probably just making a bunch of ops run in a single CUDA kernel
+
+# Horizontal layer fusion works best when the fused ops read the same source tensor
+
+# get colocated models
+
+# iterate through model graphs and categorize them
+
+# iterate through categorized layers and evaluate them for combination
+
+#
+
+def fuse_distinct_matmul(graph, op1_name, op2_name):
+    """
+    op1 : tf.Tensor
+        The first matrix multiplication operation
+    op2 : tf.Tensor
+        The second matrix multiplication operation
+    """
+
+    with tf.Session() as sess:
+        tf.import_graph_def(graph, name="")
+
+        print(tf.get_default_graph().get_operations())
+
+        op1 = tf.get_default_graph().get_operation_by_name(op1_name)
+        op2 = tf.get_default_graph().get_operation_by_name(op2_name)
+
+        left1, right1 = op1.inputs
+        left2, right2 = op2.inputs
+
+        sess.run(tf.global_variables_initializer())
+
+        weights1 = sess.run(left1)
+        weights2 = sess.run(left2)
+
+        x1, y1 = weights1.shape
+        x2, y2 = weights2.shape
+
+        s = min(y1, y2)
+        column_diff = abs(y2 - y1)
+        inputs_padding = tf.constant([[0, column_diff], [0, 0]])
+
+        if column_diff > 0:
+            if y1 == s:
+                weights_padding = np.zeros((x1, column_diff))
+                weights1 = np.concatenate((weights1, weights_padding), axis=1)
+                right1 = tf.pad(right1, inputs_padding)
+            elif y2 == s:
+                weights_padding = np.zeros((x2, column_diff))
+                weights2 = np.concatenate((weights2, weights_padding), axis=1)
+                right2 = tf.pad(right2, inputs_padding)
+
+        merged_weights = np.array(np.concatenate((weights1, weights2), axis=0), dtype=np.float32)
+        merged_tensor = tf.Variable(merged_weights)
+
+        print(merged_tensor.get_shape())
+
+        merged_matmul = tf.matmul(merged_tensor, tf.concat([right1, right2], axis=1))
+
+        right1_shape = tf.shape(right1)
+        right2_shape = tf.shape(right2)
+
+        new_op1, new_op2 = tf.split(merged_matmul, [x1, x2], axis=0)
+        new_op1, _ = tf.split(new_op1, [right1_shape[0], right2_shape[0]], axis=1)
+        _, new_op2 = tf.split(new_op2, [right1_shape[0], right2_shape[0]], axis=1)
+
+        print(op1.outputs)
+
+
+if __name__ == "__main__":
+    with tf.Session() as sess:
+        with tf.variable_scope("TEST"):
+            w_vals = np.zeros((12, 10), dtype=np.float32)
+            w_vals[0][0] = 8.2
+            w = tf.Variable(w_vals)
+            x = tf.Variable(tf.zeros([4, 11]))
+            y = tf.zeros([10, 9])
+            b = tf.zeros([11, 8])
+            z = tf.matmul(x, b)
+            a = tf.matmul(w, y)
+
+            print(z.shape)
+            print(a.shape)
+
+        sess.run(tf.global_variables_initializer())
+
+        frozen_graph = tf.graph_util.convert_variables_to_constants(
+            sess, tf.get_default_graph().as_graph_def(), ["TEST/MatMul", "TEST/MatMul_1"])
+
+        # print(frozen_graph.node)
+
+        fuse_distinct_matmul(frozen_graph, "TEST/MatMul", "TEST/MatMul_1")
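The fusion in this patch is an instance of a block-matmul identity: stacking the (zero-padded) weight matrices row-wise and concatenating the (zero-padded) inputs column-wise yields one large product whose diagonal blocks equal the two original products; the off-diagonal blocks (W1*R2 and W2*R1) are computed and discarded, which is the price of fusing matmuls that do not share an input. A minimal NumPy sketch of that identity, with illustrative shapes not taken from the patch:

import numpy as np

# Two unrelated matmuls, W1 @ R1 and W2 @ R2, with different inner dimensions.
W1 = np.random.rand(12, 10).astype(np.float32)
R1 = np.random.rand(10, 9).astype(np.float32)
W2 = np.random.rand(11, 8).astype(np.float32)
R2 = np.random.rand(8, 5).astype(np.float32)

# Zero-pad the narrower weight matrix with columns and the matching input
# with rows so both products share the same inner dimension k.
k = max(W1.shape[1], W2.shape[1])
W1p = np.pad(W1, [(0, 0), (0, k - W1.shape[1])], mode="constant")
W2p = np.pad(W2, [(0, 0), (0, k - W2.shape[1])], mode="constant")
R1p = np.pad(R1, [(0, k - R1.shape[0]), (0, 0)], mode="constant")
R2p = np.pad(R2, [(0, k - R2.shape[0]), (0, 0)], mode="constant")

# One fused multiply: rows stack the weights, columns stack the inputs.
merged = np.concatenate([W1p, W2p], axis=0) @ np.concatenate([R1p, R2p], axis=1)

# The diagonal blocks recover the original results; the off-diagonal
# blocks are wasted work.
out1 = merged[:W1.shape[0], :R1.shape[1]]
out2 = merged[W1.shape[0]:, R1.shape[1]:]

assert np.allclose(out1, W1 @ R1, atol=1e-5)
assert np.allclose(out2, W2 @ R2, atol=1e-5)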
From 964fceb33ef69e8403922f886314c5e17dd15d2b Mon Sep 17 00:00:00 2001
From: Corey Zumar
Date: Thu, 22 Mar 2018 13:15:23 -0700
Subject: [PATCH 2/5] paramfix

---
 layer_fusion.py         | 85 +++++++++++++++++++++++++++++++++++++++++
 mixed_weights_fusion.py |  8 ++--
 2 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/layer_fusion.py b/layer_fusion.py
index b921d3c..178239d 100644
--- a/layer_fusion.py
+++ b/layer_fusion.py
@@ -1,4 +1,5 @@
 import tensorflow as tf
+import numpy as np
 
 # Code that attempts to find candidate layers to merge together to reduce inference time.
 # Worthwhile experiment: how does cuBLAS matrix multiply performance scale with matrix size?
 # Hard constraints: GPU memory capacity (12GB), multiplication precision
 # Soft constraints: for small models, we could probably run quite a few kernels concurrently
@@ -18,3 +19,87 @@
 # iterate through categorized layers and evaluate them for combination
 
 #
+
+def fuse_distinct_matmul(graph, op1_name, op2_name):
+    """
+    op1 : tf.Tensor
+        The first matrix multiplication operation
+    op2 : tf.Tensor
+        The second matrix multiplication operation
+    """
+
+    with tf.Session() as sess:
+        tf.import_graph_def(graph, name="")
+
+        print(tf.get_default_graph().get_operations())
+
+        op1 = tf.get_default_graph().get_operation_by_name(op1_name)
+        op2 = tf.get_default_graph().get_operation_by_name(op2_name)
+
+        left1, right1 = op1.inputs
+        left2, right2 = op2.inputs
+
+        sess.run(tf.global_variables_initializer())
+
+        weights1 = sess.run(left1)
+        weights2 = sess.run(left2)
+
+        x1, y1 = weights1.shape
+        x2, y2 = weights2.shape
+
+        s = min(y1, y2)
+        column_diff = abs(y2 - y1)
+        inputs_padding = tf.constant([[0, column_diff], [0, 0]])
+
+        if column_diff > 0:
+            if y1 == s:
+                weights_padding = np.zeros((x1, column_diff))
+                weights1 = np.concatenate((weights1, weights_padding), axis=1)
+                right1 = tf.pad(right1, inputs_padding)
+            elif y2 == s:
+                weights_padding = np.zeros((x2, column_diff))
+                weights2 = np.concatenate((weights2, weights_padding), axis=1)
+                right2 = tf.pad(right2, inputs_padding)
+
+        merged_weights = np.array(np.concatenate((weights1, weights2), axis=0), dtype=np.float32)
+        merged_tensor = tf.Variable(merged_weights)
+
+        print(merged_tensor.get_shape())
+
+        merged_matmul = tf.matmul(merged_tensor, tf.concat([right1, right2], axis=1))
+
+        right1_shape = tf.shape(right1)
+        right2_shape = tf.shape(right2)
+
+        new_op1, new_op2 = tf.split(merged_matmul, [x1, x2], axis=0)
+        new_op1, _ = tf.split(new_op1, [right1_shape[0], right2_shape[0]], axis=1)
+        _, new_op2 = tf.split(new_op2, [right1_shape[0], right2_shape[0]], axis=1)
+
+        print(op1.outputs)
+
+
+if __name__ == "__main__":
+    with tf.Session() as sess:
+        with tf.variable_scope("TEST"):
+            w_vals = np.zeros((12, 10), dtype=np.float32)
+            w_vals[0][0] = 8.2
+            w = tf.Variable(w_vals)
+            x = tf.Variable(tf.zeros([4, 11]))
+            y = tf.zeros([10, 9])
+            b = tf.zeros([11, 8])
+            z = tf.matmul(x, b)
+            a = tf.matmul(w, y)
+
+            print(z.shape)
+            print(a.shape)
+
+        sess.run(tf.global_variables_initializer())
+
+        frozen_graph = tf.graph_util.convert_variables_to_constants(
+            sess, tf.get_default_graph().as_graph_def(), ["TEST/MatMul", "TEST/MatMul_1"])
+
+        # print(frozen_graph.node)
+
+        fuse_distinct_matmul(frozen_graph, "TEST/MatMul", "TEST/MatMul_1")

diff --git a/mixed_weights_fusion.py b/mixed_weights_fusion.py
index 178239d..e0a6935 100644
--- a/mixed_weights_fusion.py
+++ b/mixed_weights_fusion.py
@@ -22,10 +22,10 @@
 
 def fuse_distinct_matmul(graph, op1_name, op2_name):
     """
-    op1 : tf.Tensor
-        The first matrix multiplication operation
-    op2 : tf.Tensor
-        The second matrix multiplication operation
+    op1 : str
+        The name of the first matrix multiplication operation
+    op2 : str
+        The name of the second matrix multiplication operation
     """
 
     with tf.Session() as sess:
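A note on the name-based lookups the function performs: in TF 1.x graphs, operation names and tensor names are distinct, and mixing them up is a common source of lookup errors. A small sketch of the distinction, assuming the TEST graph from these patches has already been built in the default graph:

import tensorflow as tf

graph = tf.get_default_graph()

# get_operation_by_name takes the bare op name...
op = graph.get_operation_by_name("TEST/MatMul")

# ...while get_tensor_by_name needs the ":<output index>" suffix.
tensor = graph.get_tensor_by_name("TEST/MatMul:0")

assert op.outputs[0] is tensor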
From cbf2beb3c6c810949174b104eb72c81ea729d26b Mon Sep 17 00:00:00 2001
From: Corey Zumar
Date: Sun, 1 Apr 2018 23:29:52 -0700
Subject: [PATCH 3/5] Fix merged fusion and add profiling step

---
 mixed_weights_fusion.py | 104 ++++++++++++++++++++++++++++++----------
 1 file changed, 79 insertions(+), 25 deletions(-)

diff --git a/mixed_weights_fusion.py b/mixed_weights_fusion.py
index e0a6935..eb11a6b 100644
--- a/mixed_weights_fusion.py
+++ b/mixed_weights_fusion.py
@@ -1,5 +1,8 @@
 import tensorflow as tf
 import numpy as np
+import itertools
+
+from datetime import datetime
 
 # Code that attempts to find candidate layers to merge together to reduce inference time.
 # Worthwhile experiment: how does cuBLAS matrix multiply performance scale with matrix size?
 # Hard constraints: GPU memory capacity (12GB), multiplication precision
 # Soft constraints: for small models, we could probably run quite a few kernels concurrently
@@ -31,8 +34,6 @@ def fuse_distinct_matmul(graph, op1_name, op2_name):
     with tf.Session() as sess:
         tf.import_graph_def(graph, name="")
 
-        print(tf.get_default_graph().get_operations())
-
         op1 = tf.get_default_graph().get_operation_by_name(op1_name)
         op2 = tf.get_default_graph().get_operation_by_name(op2_name)
@@ -64,42 +65,95 @@ def fuse_distinct_matmul(graph, op1_name, op2_name):
         merged_weights = np.array(np.concatenate((weights1, weights2), axis=0), dtype=np.float32)
         merged_tensor = tf.Variable(merged_weights)
 
-        print(merged_tensor.get_shape())
-
         merged_matmul = tf.matmul(merged_tensor, tf.concat([right1, right2], axis=1))
 
         right1_shape = tf.shape(right1)
         right2_shape = tf.shape(right2)
 
         new_op1, new_op2 = tf.split(merged_matmul, [x1, x2], axis=0)
-        new_op1, _ = tf.split(new_op1, [right1_shape[0], right2_shape[0]], axis=1)
-        _, new_op2 = tf.split(new_op2, [right1_shape[0], right2_shape[0]], axis=1)
 
-        print(op1.outputs)
+        new_op1, _ = tf.split(new_op1, [right1_shape[1], right2_shape[1]], axis=1)
+        _, new_op2 = tf.split(new_op2, [right1_shape[1], right2_shape[1]], axis=1)
 
+        return new_op1, new_op2
+
+
+def evaluate_matmul():
+    sizes = [(256 * 256, 500), (128 * 128, 200), (64 * 64, 100)]
+
+    all_sizes = itertools.product(sizes, sizes)
+
+    idx = 0
+    for s1, s2 in all_sizes:
+        # if idx == 0:
+        #     idx += 1
+        #     continue
+        m1, n1 = s1
+        m2, n2 = s2
+        with tf.Session() as sess:
+            with tf.variable_scope("TEST"):
+                w1 = tf.Variable(np.array(np.random.rand(m1, n1), dtype=np.float32))
+                w2 = tf.Variable(np.array(np.random.rand(m2, n2), dtype=np.float32))
+
+                inp1 = tf.placeholder(tf.float32, [n1, None])
+                inp2 = tf.placeholder(tf.float32, [n2, None])
+
+                result1 = tf.matmul(w1, inp1)
+                result2 = tf.matmul(w2, inp2)
+
+            sess.run(tf.global_variables_initializer())
 
-if __name__ == "__main__":
-    with tf.Session() as sess:
-        with tf.variable_scope("TEST"):
-            w_vals = np.zeros((12, 10), dtype=np.float32)
-            w_vals[0][0] = 8.2
-            w = tf.Variable(w_vals)
-            x = tf.Variable(tf.zeros([4, 11]))
-            y = tf.zeros([10, 9])
-            b = tf.zeros([11, 8])
-            z = tf.matmul(x, b)
-            a = tf.matmul(w, y)
 
-            print(z.shape)
-            print(a.shape)
+            frozen_graph = tf.graph_util.convert_variables_to_constants(
+                sess, tf.get_default_graph().as_graph_def(), ["TEST/MatMul", "TEST/MatMul_1"])
 
-        sess.run(tf.global_variables_initializer())
+            new_op1, new_op2 = fuse_distinct_matmul(frozen_graph, "TEST/MatMul", "TEST/MatMul_1")
 
-        frozen_graph = tf.graph_util.convert_variables_to_constants(
-            sess, tf.get_default_graph().as_graph_def(), ["TEST/MatMul", "TEST/MatMul_1"])
+            sess.run(tf.global_variables_initializer())
+
+            feed_dict = {
+                inp1: np.random.rand(n1, 1),
+                inp2: np.random.rand(n2, 1)
+            }
+
+            before = datetime.now()
 
-        # print(frozen_graph.node)
+            out1, out2 = sess.run([new_op1, new_op2], feed_dict=feed_dict)
+
+            mid = datetime.now()
+
+            out1, out2 = sess.run([result1, result2], feed_dict=feed_dict)
+
+            after = datetime.now()
+
+            merged_lat = (mid - before).total_seconds()
+            iso_lat = (after - mid).total_seconds()
+
+            print(merged_lat, iso_lat)
+            break
+
-        fuse_distinct_matmul(frozen_graph, "TEST/MatMul", "TEST/MatMul_1")
+
+if __name__ == "__main__":
+    evaluate_matmul()
+    # with tf.Session() as sess:
+    #     with tf.variable_scope("TEST"):
+    #         w_vals = np.zeros((12, 10), dtype=np.float32)
+    #         w_vals[0][0] = 8.2
+    #         w = tf.Variable(w_vals)
+    #         x = tf.Variable(tf.zeros([4, 11]))
+    #         y = tf.zeros([10, 9])
+    #         b = tf.zeros([11, 8])
+    #         z = tf.matmul(x, b)
+    #         a = tf.matmul(w, y)
+    #
+    #         print(z.shape)
+    #         print(a.shape)
+    #
+    #     sess.run(tf.global_variables_initializer())
+    #
+    #     frozen_graph = tf.graph_util.convert_variables_to_constants(
+    #         sess, tf.get_default_graph().as_graph_def(), ["TEST/MatMul", "TEST/MatMul_1"])
+    #
+    #     # print(frozen_graph.node)
+    #
+    #     fuse_distinct_matmul(frozen_graph, "TEST/MatMul", "TEST/MatMul_1")
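One caveat on the profiling step added here: each variant is timed with a single sess.run, and the first run also pays one-time costs (graph pruning, GPU memory allocation, cuBLAS handle setup), which biases the comparison toward whichever variant runs second. A warm-up-and-average harness would be more robust; a sketch, where the helper name time_fetches is hypothetical:

import time

def time_fetches(sess, fetches, feed_dict, warmup=5, iters=50):
    # Warm-up runs absorb one-time setup costs so they do not
    # contaminate the measurement.
    for _ in range(warmup):
        sess.run(fetches, feed_dict=feed_dict)
    start = time.time()
    for _ in range(iters):
        sess.run(fetches, feed_dict=feed_dict)
    return (time.time() - start) / iters

# merged_lat = time_fetches(sess, [new_op1, new_op2], feed_dict)
# iso_lat = time_fetches(sess, [result1, result2], feed_dict)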
From 2edb0c7c6fd0035d437099f8449e5229ea9c5fe4 Mon Sep 17 00:00:00 2001
From: Corey Zumar
Date: Sun, 1 Apr 2018 23:48:08 -0700
Subject: [PATCH 4/5] GPU support

---
 mixed_weights_fusion.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mixed_weights_fusion.py b/mixed_weights_fusion.py
index eb11a6b..2299ccc 100644
--- a/mixed_weights_fusion.py
+++ b/mixed_weights_fusion.py
@@ -85,9 +85,9 @@ def evaluate_matmul():
 
     idx = 0
     for s1, s2 in all_sizes:
-        # if idx == 0:
-        #     idx += 1
-        #     continue
+        if idx == 0:
+            idx += 1
+            continue
         m1, n1 = s1
         m2, n2 = s2
         with tf.Session() as sess:

From 06adbbfb147bd867298b990c96636c3c6b9b4877 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 2 Apr 2018 06:48:52 +0000
Subject: [PATCH 5/5] GPU fixes

---
 mixed_weights_fusion.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mixed_weights_fusion.py b/mixed_weights_fusion.py
index eb11a6b..44a0f79 100644
--- a/mixed_weights_fusion.py
+++ b/mixed_weights_fusion.py
@@ -23,7 +23,7 @@
 
 #
 
-def fuse_distinct_matmul(graph, op1_name, op2_name):
+def fuse_distinct_matmul(sess, graph, op1_name, op2_name):
     """
     op1 : str
         The name of the first matrix multiplication operation
@@ -31,7 +31,7 @@ def fuse_distinct_matmul(graph, op1_name, op2_name):
         The name of the second matrix multiplication operation
     """
 
-    with tf.Session() as sess:
+    with tf.device("/gpu:0"):
         tf.import_graph_def(graph, name="")
 
         op1 = tf.get_default_graph().get_operation_by_name(op1_name)
@@ -90,7 +90,8 @@ def evaluate_matmul():
         #     continue
         m1, n1 = s1
         m2, n2 = s2
-        with tf.Session() as sess:
+        with tf.device("/gpu:0"):
+            sess = tf.Session()
             with tf.variable_scope("TEST"):
                 w1 = tf.Variable(np.array(np.random.rand(m1, n1), dtype=np.float32))
                 w2 = tf.Variable(np.array(np.random.rand(m2, n2), dtype=np.float32))
 
                 inp1 = tf.placeholder(tf.float32, [n1, None])
                 inp2 = tf.placeholder(tf.float32, [n2, None])
 
                 result1 = tf.matmul(w1, inp1)
                 result2 = tf.matmul(w2, inp2)
 
             sess.run(tf.global_variables_initializer())
@@ -106,7 +107,7 @@ def evaluate_matmul():
             frozen_graph = tf.graph_util.convert_variables_to_constants(
                 sess, tf.get_default_graph().as_graph_def(), ["TEST/MatMul", "TEST/MatMul_1"])
 
-            new_op1, new_op2 = fuse_distinct_matmul(frozen_graph, "TEST/MatMul", "TEST/MatMul_1")
+            new_op1, new_op2 = fuse_distinct_matmul(sess, frozen_graph, "TEST/MatMul", "TEST/MatMul_1")
 
             sess.run(tf.global_variables_initializer())
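A possible follow-up to the GPU fixes: pinning everything under tf.device("/gpu:0") raises an error if any op in the graph has no GPU kernel. Enabling soft placement in the TF 1.x session config, a sketch not taken from this series, lets such ops fall back to the CPU and logs where each op actually ran:

import tensorflow as tf

# allow_soft_placement lets ops without a GPU kernel fall back to the CPU
# instead of failing; log_device_placement prints each op's assigned device.
config = tf.ConfigProto(allow_soft_placement=True,
                        log_device_placement=True)
sess = tf.Session(config=config)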