Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions python/fft/perf_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def getFFTInfoFromLog(log_file):
if args.csv_out:
print("Writing CSV headers")
with open(args.csv_out, "w") as f:
f.write(f"Input-size,Batch-size,Radix-size,DFT Batch-size,Cycles,FLOPS Estimate,FLOPS/Cycle,GFLOPS/sec,Memory Including Gaps (bytes),Peak Live Memory Step, Peak Live Memory (bytes)\n")
f.write(f"Input-size,Batch-size,Radix-size,DFT Batch-size,Cycles,FLOPS Estimate,FLOPS/Cycle,GFLOPS/sec,Memory Including Gaps (bytes)\n")
exit()
else:
raise RuntimeError("Can't write headers: no output file specified.")
Expand All @@ -83,9 +83,6 @@ def getFFTInfoFromLog(log_file):
total_memory = sum(tile.memory.total.includingGaps for tile in report.compilation.tiles)
print(f"Total memory use (bytes): {total_memory}")

peak_name, peak_live_memory = getPeakLivenessStep(report)
print(f"Program step consuming peak memory: {peak_name} {peak_live_memory}")

size, bs, radix, cycles, flops, dft_batch_size = getFFTInfoFromLog(args.log_file)
flops_per_cycle = flops/cycles if flops else None
gflops_per_second = flops_per_cycle * args.clock_speed_ghz if flops_per_cycle else None
Expand All @@ -101,4 +98,4 @@ def getFFTInfoFromLog(log_file):
# Collate everything into one line of CSV and append to file if specified:
if args.csv_out:
with open(args.csv_out, "a") as f:
f.write(f"{size},{bs},{radix},{dft_batch_size},{cycles},{flops},{flops_per_cycle},{gflops_per_second},{total_memory},{peak_name},{peak_live_memory}\n")
f.write(f"{size},{bs},{radix},{dft_batch_size},{cycles},{flops},{flops_per_cycle},{gflops_per_second},{total_memory}\n")
35 changes: 35 additions & 0 deletions python/fft/sweep2d.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

# Sweep the FourierTransform2D tool over FFT size, batch size and radix size,
# writing one profile directory per configuration under RUN_DIR, then collate
# the results of every run into a single CSV file.
#
# Paths to ./multi-tool and ../python/fft/perf_analysis.py are relative, so the
# script must be launched from the directory that contains multi-tool.

RUN_DIR="2d_profiles"
mkdir -p "${RUN_DIR}"
CSV_FILE="${RUN_DIR}/2d_csv_results_2.txt"

for SIZE in 256 512 1024 2048
do
  for BS in 1 4 8 16 32 64
  do
    for RADIX in 2 4 8 16 32 64 128
    do
      RUN_NAME="fft_2d_${SIZE}_bs${BS}_radix${RADIX}"
      # Each background job inherits the value exported at the time it is
      # launched, so every run reports into its own directory.
      export POPLAR_ENGINE_OPTIONS="{\"autoReport.all\":\"true\", \"autoReport.directory\":\"${RUN_DIR}/${RUN_NAME}\", \"profiler.includeFlopEstimates\":\"true\"}"
      echo "Running size: ${SIZE} batch-size: ${BS} radix: ${RADIX}"
      mkdir -p "${RUN_DIR}/${RUN_NAME}"
      ./multi-tool FourierTransform2D --fft-size "${SIZE}" --batch-size "${BS}" --radix-size "${RADIX}" > "${RUN_DIR}/${RUN_NAME}/run_log.txt" &
    done
    # All radix sizes for this size/batch-size pair run concurrently; wait for
    # them to finish before launching the next group.
    wait
  done
done

# Overwrite the CSV file writing new headers (the report and log arguments are
# placeholders here; only the CSV path is meaningful):
python3 ../python/fft/perf_analysis.py --report-file fake --log-file fake --csv-out "${CSV_FILE}" --csv-write-headers

# Append all run results to CSV file:
for DIR in "${RUN_DIR}"/*
do
  # The CSV file itself lives in RUN_DIR, so the glob matches it too. Only
  # directories are run outputs; skip everything else (this also covers the
  # unexpanded pattern when RUN_DIR is empty).
  [ -d "${DIR}" ] || continue
  echo "Processing path ${DIR}"
  LOG_FILE="${DIR}/run_log.txt"
  REPORT_FILE="${DIR}/ipu_utils_engine/profile.pop"
  python3 ../python/fft/perf_analysis.py --report-file "${REPORT_FILE}" --log-file "${LOG_FILE}" --csv-out "${CSV_FILE}"
done
50 changes: 19 additions & 31 deletions src/fft/complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,10 @@ namespace complex {
namespace pe = popops::expr;
auto complexMulExprRe = pe::Sub(pe::Mul(pe::_1, pe::_2), pe::Mul(pe::_3, pe::_4));
auto complexMulExprIm = pe::Add(pe::Mul(pe::_1, pe::_2), pe::Mul(pe::_3, pe::_4));

// Can only do the second expression in-place:
auto tmpReal = popops::map(graph, complexMulExprRe, {real, v.real, imag, v.imag},
prog, debugPrefix + "/complex_mul_re");
popops::mapInPlace(graph, complexMulExprRe, {real, v.real, imag, v.imag},
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to keep my change here. Only the second operation can be done in place because it needs the input from the first.

prog, debugPrefix + "/complex_mul_re");
popops::mapInPlace(graph, complexMulExprIm, {imag, v.real, real, v.imag},
prog, debugPrefix + "/complex_mul_im");
real = tmpReal;
}

ComplexTensor multiply(poplar::Graph& graph,
Expand Down Expand Up @@ -129,17 +126,12 @@ namespace complex {
graph.setTileMapping(matrix.real, graph.getTileMapping(matmulMapping));
graph.setTileMapping(matrix.imag, graph.getTileMapping(matmulMapping));

poplar::OptionFlags matmulOptions;
if (availableMemoryProportion > 0.f) {
matmulOptions.set("availableMemoryProportion", std::to_string(availableMemoryProportion));
}

poplar::Tensor partial =
poplin::matMul(graph, matrix.real, realBatch, prog,
elemType, debugStr + "/real_matmul", matmulOptions);
elemType, debugStr + "/real_matmul");
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to keep my change here (matmul options).


poplin::matMulAcc(graph, partial, 1.f, matrix.imag, imagBatch, prog,
debugStr + "/imag_matmul", matmulOptions);
debugStr + "/imag_matmul");
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep matmul options as above.


// FLOP estimates for matrix multiplies:
flopEstimate += 2 * matrix.dim(0) * matrix.dim(1) * realBatch.dim(1) * 2;
Expand Down Expand Up @@ -219,25 +211,21 @@ namespace complex {
auto result_odd = fftSubResult.transpose().slice(batchSize, 2*batchSize, 0);
ipu_utils::logger()->debug("Twiddle coeff shape: {} and multiply shape: {}", w.shape(), result_odd.shape());

// Copy the DFT results to a linear layout if there are enough
// elements for this to make sense (this heuristic is very approximate):
if (result_even.real.numElements() > graph.getTarget().getNumTiles()) {
ipu_utils::logger()->debug("Re-mapping DFT result ({} > {}).",
result_even.real.numElements(), graph.getTarget().getNumTiles());
auto result_even_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
result_even_remapped.mapLinearly(graph);
prog.add(copy(result_even, result_even_remapped));
result_even = result_even_remapped;

auto result_odd_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
result_odd_remapped.mapLinearly(graph);
prog.add(copy(result_odd, result_odd_remapped));
result_odd = result_odd_remapped;
}
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep my change here: it uses a better layout for small FFTs to improve their performance and memory use.

// Copy the DFT results to a linear layout:
auto result_even_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
result_even_remapped.mapLinearly(graph);
prog.add(copy(result_even, result_even_remapped));
result_even = result_even_remapped;

auto result_odd_remapped = ComplexTensor(graph, result_even.elementType(), result_even.shape(), "dft_even_remapped");
result_odd_remapped.mapLinearly(graph);
prog.add(copy(result_odd, result_odd_remapped));
result_odd = result_odd_remapped;
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep this as it improves memory use and perf.


// Element-wise multiply odd components by coefficients:
result_odd.multiplyInPlace(graph, w, prog, "twiddle");
auto tmp = result_odd;
auto tmp = multiply(graph, w, result_odd, prog, "twiddle");
//result_odd.multiplyInPlace(graph, w, prog, "twiddle");
//auto tmp = result_odd;
// FLOP estimate for complex multiply:
flopEstimate += 6 * tmp.real.numElements();

Expand Down Expand Up @@ -266,7 +254,7 @@ namespace complex {

ComplexTensor FFTBuilder::inverseFourierMatrices(
std::size_t length, poplar::Type elemType) {
const double twoPi_over_length = (2.0L / length) * 3.141592653589793238462643383279502884L;
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep my change as it improves precision significantly (same below).

const float twoPi_over_length = (2.0 / length) * 3.14159265358979323846;
std::vector<float> real(length * length, 0.f);
std::vector<float> imag(length * length, 0.f);
for (std::size_t row = 0; row < length; ++row) {
Expand All @@ -292,7 +280,7 @@ namespace complex {
throw std::logic_error("FFT size must be a multiple of 2.");
}
auto baseSize = N / 2;
const double s = ((2.0L * (N-1)) / N) * 3.141592653589793238462643383279502884L;
const float s = ((2.0 * (N-1)) / N) * 3.14159265358979323846;
std::vector<float> real(baseSize, 0.f);
std::vector<float> imag(baseSize, 0.f);

Expand Down
12 changes: 3 additions & 9 deletions src/fft/complex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,29 +105,23 @@ class FFTBuilder {
FFTBuilder(poplar::Graph &graph,
poplar::program::Sequence &sequence,
const std::string debugName)
: graph(graph), prog(sequence), debugPrefix(debugName),
availableMemoryProportion(-1.f), flopEstimate(0) {}

/// Set the proportion of memory available for the inner DFT matrix-multiplies.
void setAvailableMemoryProportion(float proportion) { availableMemoryProportion = proportion; }
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep availableMem prop changes.

: graph(graph), prog(sequence), debugPrefix(debugName), flopEstimate(0) {}

/// Build the compute graph that applies FFT to the given complex vector.
/// The program will be appended to the sequence specified in construction
/// of this object.
ComplexTensor fft1d(ComplexTensor input, std::size_t radix = 0);

poplar::program::Sequence& getProgram() { return prog; }
std::size_t getFlopEstimate() const { return flopEstimate; }

private:
float availableMemoryProportion;
std::size_t flopEstimate;

// Utility functions used in construction of the FFT graph program.
ComplexTensor multiplyMatrixByVectorBatch(const ComplexTensor matrix, ComplexTensor vectors);
ComplexTensor dft1d(ComplexTensor fourierMatrix, ComplexTensor even, ComplexTensor odd);
std::pair<ComplexTensor, ComplexTensor> splitEvenOdd(ComplexTensor input);
ComplexTensor inverseFourierMatrices(std::size_t length, poplar::Type elemType);
ComplexTensor twiddleCoefficients(std::size_t N, poplar::Type elemType);
std::size_t flopEstimate;
};

} // namespace complex
6 changes: 1 addition & 5 deletions src/tools/FourierTransform.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ struct FourierTransform :
input.mapLinearly(graph);

ipu_utils::logger()->info("Building FFT of input-size {} batch-size {} radix-size {}", size, batchSize, radixSize);
builder.setAvailableMemoryProportion(availableMemoryProportion);
auto output = builder.fft1d(input, radixSize);
ipu_utils::logger()->info("FFT estimated FLOP count: {}", builder.getFlopEstimate());

Expand Down Expand Up @@ -106,9 +105,7 @@ struct FourierTransform :
"Batch size for 1D FFT (i.e. number of input vectors).")
("radix-size", po::value<std::size_t>(&radixSize)->default_value(0),
"Choose radix size (base case size at which DFT matrix-multiply is performed). The default (0) automatically "
"sets the radix to half the input size (i.e. no FFT recursion).")
("available-memory-proportion", po::value<float>(&availableMemoryProportion)->default_value(-1.f),
"Set the memory proportion available for the inner DFT matrix multiplies. Default: use the Poplar default.");
"sets the radix to half the input size (i.e. no FFT recursion).");
}

void init(const boost::program_options::variables_map& args) override {
Expand All @@ -129,7 +126,6 @@ struct FourierTransform :
std::size_t size;
std::size_t batchSize;
std::size_t radixSize;
float availableMemoryProportion;
std::vector<float> realData;
std::vector<float> imagData;
};
Loading