Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 108 additions & 55 deletions image processing timing/MemoryMapWriteBurst.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@
* Writes MANY frames into ONE container file using memory-mapped I/O
* with <= 2 GiB windows. Each frame uses a single bulk put(...).
*
* Stability strategy (per TRIAL, using a fresh file each time):
* 1) Prefault (READ_ONLY page-touch across the whole container; no dirty pages)
* 2) Short CPU warm-up to stabilize clocks/JIT
* 3) Timed BURN-IN (printed, not summarized), then force()+sleep to drain
* 4) Measured trial (copy/end-to-end/remap), then force()+sleep
*
* This isolates trials from each other and prevents steady writeback backlog
* growth across trials, which was inflating later measurements.
* Updated methodology:
* 1) Prefault (optional): READ_ONLY page-touch across the whole container (no dirty pages)
* 2) Warmup (NEW): mapping-only warmup across all windows (READ_WRITE map/unmap, no writes)
* 3) Warmup (NEW, optional): minimal dirty warmup (write 1 page per window; e.g., 4096 bytes)
* 4) CPU warmup spin
* 5) SINGLE measured burst write per trial (no full-size burn-in write)
* 6) force()+optional sleep after the measured burst
*
* Usage:
* java MemoryMapWriteBurst [width height frameCount [outDir] [bytesPerPixel] [pattern] [trials]]
Expand All @@ -30,7 +29,7 @@
* outDir output directory (default /tmp/aps)
* bytesPerPixel 1=8-bit, 2=16-bit, 4=32-bit (default 2)
* pattern { zero | random | ramp | alt } (default random)
* trials number of timed trials (default 1)
* trials number of timed trials/files to generate (default 1)
*/
public class MemoryMapWriteBurst {
// Defaults
Expand All @@ -41,13 +40,19 @@ public class MemoryMapWriteBurst {
static final long WINDOW_BYTES = 2_000_000_000L;

// Per-trial isolation & pacing (no CLI)
private static final boolean ROTATE_PATHS = true; // create a fresh file per trial
private static final boolean ENABLE_PREFAULT = true; // read-only page-touch
private static final int CPU_WARMUP_MS = 150; // small spin to stabilize clocks/JIT
private static final boolean FORCE_FLUSH_AFTER_BURNIN = false; // drain before measured copy
private static final int SLEEP_AFTER_BURNIN_MS = 0; // give writeback time to settle
private static final boolean FORCE_FLUSH_AFTER_TRIAL = false; // drain after measured copy
private static final int SLEEP_AFTER_TRIAL_MS = 0; // brief breather
private static final boolean ROTATE_PATHS = true; // create a fresh file per trial

// Optional stabilization
private static final boolean ENABLE_PREFAULT = false; // read-only page-touch (no dirty pages)
private static final boolean ENABLE_MAP_WARMUP = true; // NEW: map/unmap each window READ_WRITE (no write)
private static final boolean ENABLE_DIRTY_WARMUP = true; // NEW: dirty only a small amount per window
private static final int DIRTY_WARMUP_BYTES = 4096; // NEW: dirty 1 page per window (<= len)

private static final int CPU_WARMUP_MS = 150; // small spin to stabilize clocks/JIT

// Post-trial drain (keep visible to diagnose writeback pressure)
private static final boolean FORCE_FLUSH_AFTER_TRIAL = false;
private static final int SLEEP_AFTER_TRIAL_MS = 0;

public static void main(String[] args) throws IOException {
if (args.length == 1 || args.length == 2 || args.length > 7) {
Expand Down Expand Up @@ -82,12 +87,15 @@ public static void main(String[] args) throws IOException {

System.out.printf("Burst -> %dx%d @ %d B/px (%s), %d frames, container %,d bytes%n",
width, height, bpp, pattern, frames, containerBytes);
System.out.printf("Warmup: prefault=%s, mapWarmup=%s, dirtyWarmup=%s (%d bytes/window), cpuWarmupMs=%d%n",
ENABLE_PREFAULT, ENABLE_MAP_WARMUP, ENABLE_DIRTY_WARMUP, DIRTY_WARMUP_BYTES, CPU_WARMUP_MS);

// Per-trial totals
List<Double> copyMsList = new ArrayList<>(trials);
List<Double> end2endMsList = new ArrayList<>(trials);
List<Double> remapMsList = new ArrayList<>(trials);
List<Integer> remapsList = new ArrayList<>(trials);
List<Double> forceMsList = new ArrayList<>(trials);

for (int t = 1; t <= trials; t++) {
Path path = outDir.resolve(String.format(
Expand All @@ -104,59 +112,105 @@ public static void main(String[] args) throws IOException {

// -------------------- Prefault (READ_ONLY page-touch) --------------------
if (ENABLE_PREFAULT) {
final int page = 4096;
for (long start = 0; start < containerBytes; start += WINDOW_BYTES) {
int len = (int) Math.min(WINDOW_BYTES, containerBytes - start);
MappedByteBuffer ro = ch.map(FileChannel.MapMode.READ_ONLY, start, len);
for (int p = 0; p < len; p += page) {
byte b = ro.get(p);
if ((b & 1) == 2) System.out.print(""); // keep JIT from eliding
}
}
prefaultReadOnly(ch, containerBytes);
}

// Short CPU warm-up
cpuWarmupMillis(CPU_WARMUP_MS);
// -------------------- Warmup: mapping-only across windows (no writes) --------------------
if (ENABLE_MAP_WARMUP) {
warmupMappingsOnly(ch, containerBytes);
}

// -------------------- Timed BURN-IN (printed, not summarized) --------------------
TrialResult burn = runOneTimedTrial(ch, src, frameBytes, containerBytes);
System.out.printf("Trial %d burn-in: copy=%.3f ms | end-to-end=%.3f ms | remap=%.3f ms | remaps=%d -> %s%n",
t, burn.copyMs, burn.endToEndMs, burn.remapMs, burn.remaps, path.toAbsolutePath());
// -------------------- Warmup: minimal dirty write per window --------------------
if (ENABLE_DIRTY_WARMUP) {
warmupDirtyOnePagePerWindow(ch, containerBytes, DIRTY_WARMUP_BYTES);
}

// Drain before measured trial
double burnFlushMs = forceAndTime(ch);
if (SLEEP_AFTER_BURNIN_MS > 0) sleepMs(SLEEP_AFTER_BURNIN_MS);
// Short CPU warm-up
cpuWarmupMillis(CPU_WARMUP_MS);

// -------------------- Measured trial --------------------
// -------------------- SINGLE measured burst --------------------
TrialResult r = runOneTimedTrial(ch, src, frameBytes, containerBytes);

copyMsList.add(r.copyMs);
end2endMsList.add(r.endToEndMs);
remapMsList.add(r.remapMs);
remapsList.add(r.remaps);

System.out.printf("Trial %d/%d: copy=%.3f ms | end-to-end=%.3f ms | remap=%.3f ms | remaps=%d | force()=%.3f ms%n",
t, trials, r.copyMs, r.endToEndMs, r.remapMs, r.remaps, burnFlushMs);
// Drain after measured trial (optional) + always record force() time for diagnostics
double forceMs = forceAndTime(ch);
forceMsList.add(forceMs);

// Drain after measured trial
double flushMs = forceAndTime(ch);
System.out.printf("Trial %d/%d: copy=%.3f ms | end-to-end=%.3f ms | remap=%.3f ms | remaps=%d | force()=%.3f ms -> %s%n",
t, trials, r.copyMs, r.endToEndMs, r.remapMs, r.remaps, forceMs, path.toAbsolutePath());

if (FORCE_FLUSH_AFTER_TRIAL) {
// force() already called above; this flag is preserved for legacy toggling semantics
}
if (SLEEP_AFTER_TRIAL_MS > 0) sleepMs(SLEEP_AFTER_TRIAL_MS);
// Print flush time so you can see whether writeback is the culprit
System.out.printf("Trial %d post-flush: force()=%.3f ms%n", t, flushMs);
}
}

// Summaries (after all trials/files)
summarize("copy ms", copyMsList);
summarize("end-to-end", end2endMsList);
summarize("remap ms", remapMsList);
summarize("force ms", forceMsList);

int[] remArr = remapsList.stream().mapToInt(Integer::intValue).toArray();
System.out.println("Remaps (count): min=" + Arrays.stream(remArr).min().orElse(0) +
" max=" + Arrays.stream(remArr).max().orElse(0) +
" mean=" + String.format("%.2f", mean(remArr)));
}

// -------------------- Core timed trial (FIXED windowing logic) --------------------
// -------------------- Prefault (READ_ONLY) --------------------
private static void prefaultReadOnly(FileChannel ch, long containerBytes) throws IOException {
final int page = 4096;
for (long start = 0; start < containerBytes; start += WINDOW_BYTES) {
int len = (int) Math.min(WINDOW_BYTES, containerBytes - start);
MappedByteBuffer ro = ch.map(FileChannel.MapMode.READ_ONLY, start, len);
for (int p = 0; p < len; p += page) {
byte b = ro.get(p);
if ((b & 1) == 2) System.out.print(""); // keep JIT from eliding
}
if ((len % page) != 0) {
byte b = ro.get(len - 1);
if ((b & 1) == 2) System.out.print("");
}
}
}

// -------------------- Warmup: map/unmap windows (READ_WRITE, no writes) --------------------
private static void warmupMappingsOnly(FileChannel ch, long containerBytes) throws IOException {
for (long start = 0; start < containerBytes; start += WINDOW_BYTES) {
int len = (int) Math.min(WINDOW_BYTES, containerBytes - start);
MappedByteBuffer map = ch.map(FileChannel.MapMode.READ_WRITE, start, len);
// Touch a couple of positions via reads only (still non-dirty) to avoid complete elision.
// Reading from READ_WRITE mapping does not dirty pages.
byte b0 = map.get(0);
byte b1 = map.get(len - 1);
if (((b0 ^ b1) & 1) == 2) System.out.print("");
// Let map go out of scope; GC/unmap timing is OS-dependent, but warmup still exercises mapping path.
}
}

// -------------------- Warmup: dirty only one small page per window --------------------
private static void warmupDirtyOnePagePerWindow(FileChannel ch, long containerBytes, int dirtyBytes) throws IOException {
if (dirtyBytes <= 0) return;

byte[] tiny = new byte[Math.min(dirtyBytes, 4096)];
// Deterministic, non-zero pattern
for (int i = 0; i < tiny.length; i++) tiny[i] = (byte) (i * 31 + 7);

for (long start = 0; start < containerBytes; start += WINDOW_BYTES) {
int len = (int) Math.min(WINDOW_BYTES, containerBytes - start);
MappedByteBuffer map = ch.map(FileChannel.MapMode.READ_WRITE, start, len);
int n = Math.min(tiny.length, len);
map.position(0);
map.put(tiny, 0, n); // dirties only first page(s) of each window
}
}

// -------------------- Core timed trial (fixed windowing logic) --------------------
private static TrialResult runOneTimedTrial(FileChannel ch, byte[] src,
int frameBytes, long containerBytes) throws IOException {
long copyNsTotal = 0L;
Expand All @@ -165,34 +219,29 @@ private static TrialResult runOneTimedTrial(FileChannel ch, byte[] src,

int remaps = 0;
final int totalFrames = (int) (containerBytes / frameBytes);

MappedByteBuffer map = null;
long currentWindowStart = -1;
int currentWindowLen = 0;

for (int frameIndex = 0; frameIndex < totalFrames; frameIndex++) {
long frameOffset = (long) frameIndex * frameBytes;
long frameEnd = frameOffset + frameBytes;

// Check if current frame fits in the current window
boolean needsRemap = (map == null) ||
(frameOffset < currentWindowStart) ||

boolean needsRemap = (map == null) ||
(frameOffset < currentWindowStart) ||
(frameEnd > currentWindowStart + currentWindowLen);

if (needsRemap) {
// Start new window at this frame's offset
long windowStart = frameOffset;

// Window extends as far as possible (up to WINDOW_BYTES) but must stay within container
long maxWindowEnd = Math.min(windowStart + WINDOW_BYTES, containerBytes);
int windowLen = (int) (maxWindowEnd - windowStart);

// Create new mapping

long t0 = System.nanoTime();
map = ch.map(FileChannel.MapMode.READ_WRITE, windowStart, windowLen);
map.position(0);
map.position(0);
long t1 = System.nanoTime();

currentWindowStart = windowStart;
currentWindowLen = windowLen;
remaps++;
Expand Down Expand Up @@ -257,6 +306,10 @@ private static void sleepMs(int ms) {

// -------------------- Stats helpers --------------------
private static void summarize(String label, List<Double> vals) {
if (vals.isEmpty()) {
System.out.printf("Summary (%s): no data%n", label);
return;
}
double[] a = vals.stream().mapToDouble(Double::doubleValue).toArray();
Arrays.sort(a);
double mn = a[0], mx = a[a.length - 1];
Expand Down
137 changes: 137 additions & 0 deletions image processing timing/run_burst_stats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/env bash
set -euo pipefail

# ---- Config (edit as needed) ----
JAVA_CMD=${JAVA_CMD:-java} # e.g. "java" or "/path/to/java"
CLASS=${CLASS:-MemoryMapWriteBurst} # class name (no .java)
WIDTH=${WIDTH:-2048}
HEIGHT=${HEIGHT:-2048}
FRAMES=${FRAMES:-1000}
OUTDIR=${OUTDIR:-/tmp/aps}
BPP=${BPP:-2}
PATTERN=${PATTERN:-random}
N=${N:-10} # number of separate-process runs

# Pause between runs to let background writeback settle (set 0 for stress)
SLEEP_BETWEEN=${SLEEP_BETWEEN:-3}

mkdir -p "$OUTDIR"
TS=$(date +"%Y%m%d_%H%M%S")
CSV="$OUTDIR/burst_stats_${WIDTH}x${HEIGHT}_${BPP}bpp_${FRAMES}frames_${TS}.csv"
LOG="$OUTDIR/burst_stats_${WIDTH}x${HEIGHT}_${BPP}bpp_${FRAMES}frames_${TS}.log"

# Container size (bytes) = width * height * bpp * frames
# Use awk for 64-bit safety on macOS bash
CONTAINER_BYTES=$(awk -v w="$WIDTH" -v h="$HEIGHT" -v b="$BPP" -v f="$FRAMES" 'BEGIN{printf "%.0f", w*h*b*f}')
CONTAINER_MB=$(awk -v cb="$CONTAINER_BYTES" 'BEGIN{printf "%.3f", cb/1e6}')

echo "Container: ${CONTAINER_BYTES} bytes (${CONTAINER_MB} MB)"
echo "Writing CSV to: $CSV"
echo "Full logs to: $LOG"
echo

echo "run,copy_ms,end2end_ms,remap_ms,remaps,force_ms,throughput_MBps,throughput_durable_MBps,path" > "$CSV"

for i in $(seq 1 "$N"); do
echo "=== Run $i/$N ===" | tee -a "$LOG"

# NOTE: assumes you've already compiled and are running from the directory containing .class
OUT="$($JAVA_CMD -cp . "$CLASS" "$WIDTH" "$HEIGHT" "$FRAMES" "$OUTDIR" "$BPP" "$PATTERN" 1 2>&1 | tee -a "$LOG")"

LINE=$(echo "$OUT" | grep -E "Trial 1/1:" | tail -n 1)

if [[ -z "$LINE" ]]; then
echo "ERROR: Could not parse Trial line in run $i. See log: $LOG" | tee -a "$LOG"
exit 1
fi

copy_ms=$(echo "$LINE" | sed -E 's/.*copy=([0-9.]+) ms.*/\1/')
end2end_ms=$(echo "$LINE" | sed -E 's/.*end-to-end=([0-9.]+) ms.*/\1/')
remap_ms=$(echo "$LINE" | sed -E 's/.*remap=([0-9.]+) ms.*/\1/')
remaps=$(echo "$LINE" | sed -E 's/.*remaps=([0-9]+).*/\1/')
force_ms=$(echo "$LINE" | sed -E 's/.*force\(\)=([0-9.]+) ms.*/\1/')
path=$(echo "$LINE" | sed -E 's/.*-> (.*)$/\1/')

# Buffered throughput (MB/s) = containerBytes / (end2endSeconds) / 1e6
throughput=$(awk -v cb="$CONTAINER_BYTES" -v ms="$end2end_ms" \
'BEGIN{ if(ms<=0){print "nan"} else { printf "%.3f", (cb/(ms/1000.0))/1e6 } }')

# Durable throughput (MB/s) = containerBytes / ((end2end+force)Seconds) / 1e6
throughput_durable=$(awk -v cb="$CONTAINER_BYTES" -v e="$end2end_ms" -v f="$force_ms" \
'BEGIN{ t=(e+f)/1000.0; if(t<=0){print "nan"} else { printf "%.3f", (cb/t)/1e6 } }')

echo "$i,$copy_ms,$end2end_ms,$remap_ms,$remaps,$force_ms,$throughput,$throughput_durable,\"$path\"" >> "$CSV"

if [[ "$SLEEP_BETWEEN" != "0" ]]; then
sleep "$SLEEP_BETWEEN"
fi
done

echo
echo "Done."
echo "CSV: $CSV"
echo "LOG: $LOG"
echo

# ---- Stats printer (awk / macOS compatible) ----
# Prints: n, min, max, mean, std, p50, p95
stats() {
local col="$1"
local label="$2"

awk -F, -v COL="$col" -v LABEL="$label" '
# Percentile helper (linear interpolation). Expects sorted array a[1..n].
function pct(p, rank, lo, hi, w){
if(n==1) return a[1]
rank = (p/100.0) * (n-1) + 1
lo = int(rank)
hi = lo + 1
if(hi>n) return a[n]
w = rank - lo
return a[lo]*(1-w) + a[hi]*w
}

NR==1 { next } # skip header

{
v = $COL + 0
n++
a[n] = v
sum += v
sumsq += v*v
if(n==1 || v < min) min = v
if(n==1 || v > max) max = v
}

END {
if(n==0){ printf "%s: no data\n", LABEL; exit }

mean = sum / n
var = (sumsq / n) - (mean*mean)
if(var < 0) var = 0
std = sqrt(var)

# sort a[1..n] (simple O(n^2), fine for small N)
for(i=1;i<=n;i++){
for(j=i+1;j<=n;j++){
if(a[j] < a[i]){
t=a[i]; a[i]=a[j]; a[j]=t
}
}
}

p50 = pct(50)
p95 = pct(95)

printf "%s: n=%d | min=%.3f | max=%.3f | mean=%.3f | std=%.3f | p50=%.3f | p95=%.3f\n",
LABEL, n, min, max, mean, std, p50, p95
}
' "$CSV"
}

echo "=== Summary statistics (${WIDTH}x${HEIGHT}, ${BPP} B/px, ${FRAMES} frames; ${CONTAINER_MB} MB) ==="
stats 3 "end2end_ms"
stats 2 "copy_ms"
stats 6 "force_ms"
stats 7 "throughput_MBps"
stats 8 "throughput_durable_MBps"