markp-gc · bertiethorpe · Sep 2, 2022 · markp-gc · Sep 6, 2022 · markp-gc
diff --git a/python/fft/perf_analysis.py b/python/fft/perf_analysis.py
@@ -72,7 +72,7 @@ def getFFTInfoFromLog(log_file):
     if args.csv_out:
       print("Writing CSV headers")
       with open(args.csv_out, "w") as f:
-        f.write(f"Input-size,Batch-size,Radix-size,DFT Batch-size,Cycles,FLOPS Estimate,FLOPS/Cycle,GFLOPS/sec,Memory Including Gaps (bytes),Peak Live Memory Step, Peak Live Memory (bytes)\n")
+        f.write(f"Input-size,Batch-size,Radix-size,DFT Batch-size,Cycles,FLOPS Estimate,FLOPS/Cycle,GFLOPS/sec,Memory Including Gaps (bytes)\n")
       exit()
     else:
       raise RuntimeError("Can't write headers: no output file specified.")
@@ -83,9 +83,6 @@ def getFFTInfoFromLog(log_file):
   total_memory = sum(tile.memory.total.includingGaps for tile in report.compilation.tiles)
   print(f"Total memory use (bytes): {total_memory}")
 
-  peak_name, peak_live_memory = getPeakLivenessStep(report)
-  print(f"Program step consuming peak memory: {peak_name} {peak_live_memory}")
-
   size, bs, radix, cycles, flops, dft_batch_size = getFFTInfoFromLog(args.log_file)
   flops_per_cycle = flops/cycles if flops else None
   gflops_per_second = flops_per_cycle * args.clock_speed_ghz if flops_per_cycle else None
@@ -101,4 +98,4 @@ def getFFTInfoFromLog(log_file):
   # Collate everything into one line of CSV and append to file if specififed:
   if args.csv_out:
     with open(args.csv_out, "a") as f:
-      f.write(f"{size},{bs},{radix},{dft_batch_size},{cycles},{flops},{flops_per_cycle},{gflops_per_second},{total_memory},{peak_name},{peak_live_memory}\n")
+      f.write(f"{size},{bs},{radix},{dft_batch_size},{cycles},{flops},{flops_per_cycle},{gflops_per_second},{total_memory}\n")
diff --git a/python/fft/sweep2d.sh b/python/fft/sweep2d.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
+
+# Sweeps the FourierTransform2D tool over FFT size, batch-size and radix-size,
+# writing a Poplar profile and a run log per configuration, then collates the
+# results of every run into one CSV file using perf_analysis.py.
+# Paths to ./multi-tool and ../python/fft/perf_analysis.py are relative to the
+# directory the script is run from.
+
+RUN_DIR="2d_profiles"
+mkdir -p "${RUN_DIR}"
+CSV_FILE="${RUN_DIR}/2d_csv_results_2.txt"
+
+for SIZE in 256 512 1024 2048
+do
+  for BS in 1 4 8 16 32 64
+  do
+    for RADIX in 2 4 8 16 32 64 128
+    do
+      RUN_NAME="fft_2d_${SIZE}_bs${BS}_radix${RADIX}"
+      export POPLAR_ENGINE_OPTIONS="{\"autoReport.all\":\"true\", \"autoReport.directory\":\"${RUN_DIR}/${RUN_NAME}\", \"profiler.includeFlopEstimates\":\"true\"}"
+      echo "Running size: ${SIZE} batch-size: ${BS} radix: ${RADIX}"
+      mkdir -p "${RUN_DIR}/${RUN_NAME}"
+      ./multi-tool FourierTransform2D --fft-size ${SIZE} --batch-size ${BS} --radix-size ${RADIX} > "${RUN_DIR}/${RUN_NAME}/run_log.txt" &
+    done
+    # Every radix variant for this size/batch-size runs in the background:
+    # wait for all of them before launching the next group.
+    wait
+  done
+done
+
+# Overwrite the CSV file writing new headers:
+python3 ../python/fft/perf_analysis.py --report-file fake --log-file fake --csv-out "${CSV_FILE}" --csv-write-headers
+
+# Append all run results to CSV file:
+for DIR in "${RUN_DIR}"/*
+do
+  # The CSV file itself lives in RUN_DIR so the glob matches it too: skip
+  # anything that is not a per-run directory.
+  [ -d "${DIR}" ] || continue
+  echo "Processing path ${DIR}"
+  LOG_FILE="${DIR}/run_log.txt"
+  REPORT_FILE="${DIR}/ipu_utils_engine/profile.pop"
+  python3 ../python/fft/perf_analysis.py --report-file "${REPORT_FILE}" --log-file "${LOG_FILE}" --csv-out "${CSV_FILE}"
+done
diff --git a/src/fft/complex.cpp b/src/fft/complex.cpp
@@ -55,13 +55,10 @@ namespace complex {
     namespace pe = popops::expr;
     auto complexMulExprRe = pe::Sub(pe::Mul(pe::_1, pe::_2), pe::Mul(pe::_3, pe::_4));
     auto complexMulExprIm = pe::Add(pe::Mul(pe::_1, pe::_2), pe::Mul(pe::_3, pe::_4));
-
-    // Can only do the second expression in-place:
-    auto tmpReal = popops::map(graph, complexMulExprRe, {real, v.real, imag, v.imag},
-                               prog, debugPrefix + "/complex_mul_re");
+    popops::mapInPlace(graph, complexMulExprRe, {real, v.real, imag, v.imag},
+                       prog, debugPrefix + "/complex_mul_re");
     popops::mapInPlace(graph, complexMulExprIm, {imag, v.real, real, v.imag},
                        prog, debugPrefix + "/complex_mul_im");
-    real = tmpReal;
   }
 
   ComplexTensor multiply(poplar::Graph& graph,
@@ -129,17 +126,12 @@ namespace complex {
     graph.setTileMapping(matrix.real, graph.getTileMapping(matmulMapping));
     graph.setTileMapping(matrix.imag, graph.getTileMapping(matmulMapping));
 
-    poplar::OptionFlags matmulOptions;
-    if (availableMemoryProportion > 0.f) {
-      matmulOptions.set("availableMemoryProportion", std::to_string(availableMemoryProportion));
-    }
-
     poplar::Tensor partial =
       poplin::matMul(graph, matrix.real, realBatch, prog,
-                     elemType, debugStr + "/real_matmul", matmulOptions);
+                     elemType, debugStr + "/real_matmul");
 
     poplin::matMulAcc(graph, partial, 1.f, matrix.imag, imagBatch, prog,
-                      debugStr + "/imag_matmul", matmulOptions);
+                      debugStr + "/imag_matmul");
 
     // FLOP estimates for matrix multiplies:
     flopEstimate += 2 * matrix.dim(0) * matrix.dim(1) * realBatch.dim(1) * 2;
@@ -219,25 +211,21 @@ namespace complex {
     auto result_odd = fftSubResult.transpose().slice(batchSize, 2*batchSize, 0);
     ipu_utils::logger()->debug("Twiddle coeff shape: {} and multiply shape: {}", w.shape(), result_odd.shape());
 
-    // Copy the DFT results to a linear layout if there are enough
-    // elements for this to make sense (this heuristic is very approximate):
-    if (result_even.real.numElements() > graph.getTarget().getNumTiles()) {
-      ipu_utils::logger()->debug("Re-mapping DFT result ({} > {}).",
-                                 result_even.real.numElements(), graph.getTarget().getNumTiles());
-      auto result_even_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
-      result_even_remapped.mapLinearly(graph);
-      prog.add(copy(result_even, result_even_remapped));
-      result_even = result_even_remapped;
-
-      auto result_odd_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
-      result_odd_remapped.mapLinearly(graph);
-      prog.add(copy(result_odd, result_odd_remapped));
-      result_odd = result_odd_remapped;
-    }
+    // Copy the DFT results to a linear layout:
+    auto result_even_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
+    result_even_remapped.mapLinearly(graph);
+    prog.add(copy(result_even, result_even_remapped));
+    result_even = result_even_remapped;
+
+    auto result_odd_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
+    result_odd_remapped.mapLinearly(graph);
+    prog.add(copy(result_odd, result_odd_remapped));
+    result_odd = result_odd_remapped;
 
     // Element-wise multiply odd components by coefficients:
-    result_odd.multiplyInPlace(graph, w, prog, "twiddle");
-    auto tmp = result_odd;
+    auto tmp = multiply(graph, w, result_odd, prog, "twiddle");
+    //result_odd.multiplyInPlace(graph, w, prog, "twiddle");
+    //auto tmp = result_odd;
     // FLOP estimate for complex multiply:
     flopEstimate += 6 * tmp.real.numElements();
 
@@ -266,7 +254,7 @@ namespace complex {
 
   ComplexTensor FFTBuilder::inverseFourierMatrices(
       std::size_t length, poplar::Type elemType) {
-    const double twoPi_over_length = (2.0L / length) * 3.141592653589793238462643383279502884L;
+    const float twoPi_over_length = (2.0 / length) * 3.14159265358979323846;
     std::vector<float> real(length * length, 0.f);
     std::vector<float> imag(length * length, 0.f);
     for (std::size_t row = 0; row < length; ++row) {
@@ -292,7 +280,7 @@ namespace complex {
       throw std::logic_error("FFT size must be a multiple of 2.");
     }
     auto baseSize = N / 2;
-    const double s = ((2.0L * (N-1)) / N) * 3.141592653589793238462643383279502884L;
+    const float s = ((2.0 * (N-1)) / N) * 3.14159265358979323846;
     std::vector<float> real(baseSize, 0.f);
     std::vector<float> imag(baseSize, 0.f);
 

diff --git a/src/fft/complex.hpp b/src/fft/complex.hpp
@@ -105,29 +105,23 @@ class FFTBuilder {
   FFTBuilder(poplar::Graph &graph,
              poplar::program::Sequence &sequence,
              const std::string debugName)
-    : graph(graph), prog(sequence), debugPrefix(debugName),
-      availableMemoryProportion(-1.f), flopEstimate(0) {}
-
-  /// Set the proportion of memory available for the inner DFT matrix-multiplies.
-  void setAvailableMemoryProportion(float proportion) { availableMemoryProportion = proportion; }
+    : graph(graph), prog(sequence), debugPrefix(debugName), flopEstimate(0) {}
 
   /// Build the compute graph that applies FFT to the given complex vector.
   /// The program will be appended to the sequence specified in construction
   /// of this object.
   ComplexTensor fft1d(ComplexTensor input, std::size_t radix = 0);
-
+  poplar::program::Sequence& getProgram() { return prog; }
   std::size_t getFlopEstimate() const { return flopEstimate; }
 
 private:
-  float availableMemoryProportion;
-  std::size_t flopEstimate;
-
   // Utility functions used in construction of the FFT graph program.
   ComplexTensor multiplyMatrixByVectorBatch(const ComplexTensor matrix, ComplexTensor vectors);
   ComplexTensor dft1d(ComplexTensor fourierMatrix, ComplexTensor even, ComplexTensor odd);
   std::pair<ComplexTensor, ComplexTensor> splitEvenOdd(ComplexTensor input);
   ComplexTensor inverseFourierMatrices(std::size_t length, poplar::Type elemType);
   ComplexTensor twiddleCoefficients(std::size_t N, poplar::Type elemType);
+  std::size_t flopEstimate;
 };
 
 } // namespace complex
diff --git a/src/tools/FourierTransform.hpp b/src/tools/FourierTransform.hpp
@@ -43,7 +43,6 @@ struct FourierTransform :
     input.mapLinearly(graph);
 
     ipu_utils::logger()->info("Building FFT of input-size {} batch-size {} radix-size {}", size, batchSize, radixSize);
-    builder.setAvailableMemoryProportion(availableMemoryProportion);
     auto output = builder.fft1d(input, radixSize);
     ipu_utils::logger()->info("FFT estimated FLOP count: {}", builder.getFlopEstimate());
 
@@ -106,9 +105,7 @@ struct FourierTransform :
      "Batch size for 1D FFT (i.e. number of input vectors).")
     ("radix-size", po::value<std::size_t>(&radixSize)->default_value(0),
      "Choose radix size (base case size at which DFT matrix-multiply is performed). The default (0) automatically "
-     "sets the radix to half the input size (i.e. no FFT recursion).")
-    ("available-memory-proportion", po::value<float>(&availableMemoryProportion)->default_value(-1.f),
-     "Set the memory proportion available for the inner DFT matrix multiplies. Default: use the Poplar default.");
+     "sets the radix to half the input size (i.e. no FFT recursion).");
   }
 
   void init(const boost::program_options::variables_map& args) override {
@@ -129,7 +126,6 @@ struct FourierTransform :
   std::size_t size;
   std::size_t batchSize;
   std::size_t radixSize;
-  float availableMemoryProportion;
   std::vector<float> realData;
   std::vector<float> imagData;
 };