List iteration benchmark (#10888)

dougqh · sarahchen6 · devflow.devflow-routing-intake · web-flow · commit 16a52978f242 · 2026-06-24T18:05:53.000Z
List iteration benchmark Merge branch 'master' into dougqh/list-iteration-benchmark Fixing silly oversight - commented out benchmarks to run one in isolation earlier A bit of clean-up Adding missing end ul to doc Moving iterator benchmarks next to enhancedFor Merge branch 'master' into dougqh/list-iteration-benchmark spotless Merge branch 'dougqh/list-iteration-benchmark' of github.com:DataDog/dd-trace-java into dougqh/list-iteration-benchmark Update internal-api/src/jmh/java/datadog/trace/util/ListIterationBenchmark.java Co-authored-by: Sarah Chen <sarah.chen@datadoghq.com> Update internal-api/src/jmh/java/datadog/trace/util/ListIterationBenchmark.java Co-authored-by: Sarah Chen <sarah.chen@datadoghq.com> Merge branch 'master' into dougqh/list-iteration-benchmark Update internal-api/src/jmh/java/datadog/trace/util/ListIterationBenchmark.java Co-authored-by: Sarah Chen <sarah.chen@datadoghq.com> Isolate per-thread collections in ListIterationBenchmark Build each thread's list (and its Elements) in a Scope.Thread @setup so the manipulate_* mutations stay thread-local. Previously the lists lived in enum constants shared across all 8 threads, so the benchmark measured cross-thread contention on Element.num rather than iteration cost. Also bump to @fork(2) and fix a Javadoc typo. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Merge remote-tracking branch 'origin/master' into dougqh/list-iteration-benchmark Replace stale results in ListIterationBenchmark with Java 17 numbers Drop the old (pre-per-thread-state) results table; add a condensed Java 17 block. For ArrayList the direct styles (cstyleFor/forEach/enhanced-for/iterator) cluster within ~10%; stream() is ~3.6x slower; parallelStream() is catastrophic for small lists (ForkJoinPool overhead) and erratic. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Merge branch 'master' into dougqh/list-iteration-benchmark Co-authored-by: sarahchen6 <sarah.chen@datadoghq.com> Co-authored-by: devflow.devflow-routing-intake <devflow.devflow-routing-intake@kubernetes.us1.ddbuild.io>
diff --git a/internal-api/src/jmh/java/datadog/trace/util/ListIterationBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/ListIterationBenchmark.java
@@ -0,0 +1,204 @@
+package datadog.trace.util;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.function.Supplier;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.CompilerControl;
+import org.openjdk.jmh.annotations.CompilerControl.Mode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * Benchmark comparing different ways to iterate list of different types and sizes -- both with
+ * simple loop bodies (inline case) and complicated loop bodies (dont inline case).
+ *
+ * <ul>
+ *   Compares...
+ *   <li>(RECOMMENDED) enhanced for loop / iterator - usually most performant, since escape analysis
+ *       usually eliminates iterator allocation
+ *   <li>(SITUATIONAL) List.forEach - good when using a non-capturing lambda, when escape analysis
+ *       fails to eliminate iterator allocation - good alternative
+ *   <li>(SITUATIONAL) c-style i=0; i < list.size() - usually worse than enhanced for - might be
+ *       useful with complicated loop body when escape analysis fails to eliminate the iterator
+ *   <li>(DISCOURAGED) List.stream - always incurs allocation overhead - usually unnecessary
+ *   <li>(DISCOURAGED) List.parallelStream - heavy allocation overhead - only beneficial when
+ *       working with sets (uncommon in the java agent)
+ *   <li>
+ * </ul>
+ *
+ * <p>Java 17 results (Apple M1, {@code @Fork(2)}, {@code @Threads(8)}; {@code ArrayList}, M ops/s =
+ * millions, shown at two sizes): <code>
+ * Iteration style       size 10   size 100
+ * cstyleFor               1050       165     (fastest)
+ * forEach                  995       163
+ * enhancedFor              945       153
+ * iterator                 935       148     (noisier run-to-run)
+ * streams                  158        45     (~3.6x slower; allocates)
+ * parallelStreams           ~1      ~0.3     (catastrophic at these sizes)
+ * </code>
+ *
+ * <p>Key findings:
+ *
+ * <ul>
+ *   <li>For {@code ArrayList}, the direct styles -- {@code cstyleFor}, {@code forEach},
+ *       enhanced-for, and explicit {@code iterator} -- cluster within ~10% of each other; escape
+ *       analysis eliminates the iterator allocation, so enhanced-for/iterator stay competitive
+ *       while reading cleanest (the RECOMMENDED choice).
+ *   <li>{@code stream()} is ~3.6x slower than direct iteration and allocates per call -- avoid on
+ *       hot paths.
+ *   <li>{@code parallelStream()} is catastrophic for small collections (hundreds of times slower):
+ *       ForkJoinPool split/coordinate overhead dwarfs the work, and it is run-to-run erratic. Never
+ *       use it for the small lists typical in the agent.
+ *   <li>{@code _inline} vs {@code _dont_inline} loop bodies barely differ at these sizes -- the
+ *       iteration mechanics dominate, not the body.
+ * </ul>
+ */
+@Fork(2)
+@Warmup(iterations = 2)
+@Measurement(iterations = 3)
+@Threads(8)
+@State(Scope.Thread)
+public class ListIterationBenchmark {
+  public static final class Element {
+    int num = 0;
+
+    @CompilerControl(Mode.INLINE)
+    void manipulate_inline() {
+      this.num += 1;
+    }
+
+    @CompilerControl(Mode.DONT_INLINE)
+    void manipulate_dont_inline() {
+      this.num += 1;
+    }
+  }
+
+  static ArrayList<Element> newArrayList(int size) {
+    ArrayList<Element> newList = new ArrayList<>(size);
+    for (int i = 0; i < size; ++i) {
+      newList.add(new Element());
+    }
+    return newList;
+  }
+
+  /**
+   * Describes the list under test as a factory rather than a prebuilt instance. Each benchmark
+   * thread builds its own list (with its own {@link Element}s) in {@link #setUp()}, so the {@code
+   * manipulate_*} mutations stay thread-local — otherwise, with {@code @Threads(8)} sharing one
+   * list held in an enum constant, the benchmark would measure cross-thread contention on {@code
+   * Element.num} rather than iteration cost.
+   */
+  public enum ListSpec {
+    COLLECTIONS_EMPTY_LIST(Collections::emptyList),
+    EMPTY_ARRAY_LIST(ArrayList::new),
+    SINGLETON_LIST(() -> Collections.singletonList(new Element())),
+    ARRAY_LIST_1(() -> newArrayList(1)),
+    ARRAY_LIST_5(() -> newArrayList(5)),
+    ARRAY_LIST_10(() -> newArrayList(10)),
+    ARRAY_LIST_100(() -> newArrayList(100));
+
+    private final Supplier<List<Element>> factory;
+
+    ListSpec(Supplier<List<Element>> factory) {
+      this.factory = factory;
+    }
+
+    List<Element> build() {
+      return factory.get();
+    }
+  }
+
+  @Param ListSpec listSpec;
+
+  List<Element> list;
+
+  @Setup(Level.Trial)
+  public void setUp() {
+    // Built per thread (the class is @State(Scope.Thread)) so each thread owns its own Elements.
+    this.list = this.listSpec.build();
+  }
+
+  @Benchmark
+  public void forEach_inline() {
+    this.list.forEach(Element::manipulate_inline);
+  }
+
+  @Benchmark
+  public void forEach_dont_inline() {
+    this.list.forEach(Element::manipulate_dont_inline);
+  }
+
+  @Benchmark
+  public void enhancedFor_inline() {
+    // Enhanced for-loop is just syntax sugar for an Iterator
+    for (Element e : this.list) {
+      e.manipulate_inline();
+    }
+  }
+
+  @Benchmark
+  public void enhancedFor_dont_inline() {
+    // Enhanced for-loop is just syntax sugar for an Iterator
+    for (Element e : this.list) {
+      e.manipulate_dont_inline();
+    }
+  }
+
+  @Benchmark
+  public void iterator_inline() {
+    for (Iterator<Element> iter = this.list.iterator(); iter.hasNext(); ) {
+      iter.next().manipulate_inline();
+    }
+  }
+
+  @Benchmark
+  public void iterator_dont_inline() {
+    for (Iterator<Element> iter = this.list.iterator(); iter.hasNext(); ) {
+      iter.next().manipulate_dont_inline();
+    }
+  }
+
+  @Benchmark
+  public void cstyleFor_inline() {
+    for (int i = 0; i < this.list.size(); ++i) {
+      this.list.get(i).manipulate_inline();
+    }
+  }
+
+  @Benchmark
+  public void cstyleFor_dont_inline() {
+    for (int i = 0; i < this.list.size(); ++i) {
+      this.list.get(i).manipulate_dont_inline();
+    }
+  }
+
+  @Benchmark
+  public void streams_inline() {
+    this.list.stream().forEach(Element::manipulate_inline);
+  }
+
+  @Benchmark
+  public void streams_dont_inline() {
+    this.list.stream().forEach(Element::manipulate_dont_inline);
+  }
+
+  @Benchmark
+  public void parallelStreams_inline() {
+    this.list.parallelStream().forEach(Element::manipulate_inline);
+  }
+
+  @Benchmark
+  public void parallelStreams_dont_inline() {
+    this.list.parallelStream().forEach(Element::manipulate_dont_inline);
+  }
+}