RawSpeed
fast raw decoding library
Loading...
Searching...
No Matches
CoalescingOutputIteratorBenchmark.cpp
Go to the documentation of this file.
1/*
2 RawSpeed - RAW file decoder.
3
4 Copyright (C) 2024 Roman Lebedev
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with this library; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19*/
20
#include "adt/CoalescingOutputIterator.h"
#include "adt/Array1DRef.h"
#include "adt/Casts.h"
#include "adt/DefaultInitAllocatorAdaptor.h"
#include "bench/Common.h"
#include "common/Common.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <memory>
#include <type_traits>
#include <vector>
#include <benchmark/benchmark.h>
34
35namespace rawspeed {
36
37namespace {
38
// Compile-time description of a single benchmark configuration.
//
// `coalescing` is a std::bool_constant carrying the flag `b`, and
// `value_type` is the element type `T` the benchmark stores into.
template <bool b, typename T> struct C {
  using coalescing = std::bool_constant<b>;
  using value_type = T;
};

// Baseline configuration: coalescing disabled, single-byte elements.
using NoCoalescing = C<false, uint8_t>;

// Configuration with coalescing enabled and elements of type `T`.
template <typename T> using CoalesceTo = C<true, T>;
45
// Picks the iterator a benchmark should write through.
//
// With `ShouldCoalesce == false` the underlying output iterator `e` is
// returned as-is. With `ShouldCoalesce == true` it is wrapped into a
// CoalescingOutputIterator (the wrapped type is deduced from `e`).
template <bool ShouldCoalesce, typename UnderlyingOutputIterator>
auto getMaybeCoalescingOutputIterator(UnderlyingOutputIterator e) {
  if constexpr (ShouldCoalesce) {
    return CoalescingOutputIterator(e);
  } else {
    return e;
  }
}
57
58template <typename C> void BM_Broadcast(benchmark::State& state) {
59 using T = typename C::value_type;
60
61 int64_t numBytes = state.range(0);
62 const int bytesPerChunk = sizeof(T);
63 const auto numChunks =
64 implicit_cast<int>(roundUpDivisionSafe(numBytes, bytesPerChunk));
65 numBytes = bytesPerChunk * numChunks;
66
67 std::vector<T, DefaultInitAllocatorAdaptor<T, std::allocator<T>>> output;
68 output.reserve(implicit_cast<size_t>(numChunks));
69
70 for (auto _ : state) {
71 output.clear();
72
74 std::back_inserter(output));
75 uint8_t value = 0;
76 for (int chunkIndex = 0; chunkIndex != numChunks; ++chunkIndex) {
77#if defined(__clang__)
78#pragma clang loop unroll(full)
79#endif
80 for (int byteOfChunk = 0; byteOfChunk != bytesPerChunk; ++byteOfChunk) {
81 benchmark::DoNotOptimize(value);
82 *iter = value;
83 }
84 }
85 }
86 assert(implicit_cast<int64_t>(output.size()) == numChunks);
87
88 state.SetComplexityN(numBytes);
89 state.counters.insert({
90 {"Throughput",
91 benchmark::Counter(sizeof(std::byte) * state.complexity_length_n(),
92 benchmark::Counter::Flags::kIsIterationInvariantRate,
93 benchmark::Counter::kIs1024)},
94 {"Latency",
95 benchmark::Counter(sizeof(std::byte) * state.complexity_length_n(),
96 benchmark::Counter::Flags::kIsIterationInvariantRate |
97 benchmark::Counter::Flags::kInvert,
98 benchmark::Counter::kIs1000)},
99 });
100}
101
102template <typename C> void BM_Copy(benchmark::State& state) {
103 using T = typename C::value_type;
104
105 int64_t numBytes = state.range(0);
106 const int bytesPerChunk = sizeof(T);
107 const auto numChunks =
108 implicit_cast<int>(roundUpDivisionSafe(numBytes, bytesPerChunk));
109 numBytes = bytesPerChunk * numChunks;
110
111 std::vector<uint8_t,
113 inputStorage(implicit_cast<size_t>(numBytes), uint8_t{0});
114 const auto input = Array1DRef(
115 inputStorage.data(), rawspeed::implicit_cast<int>(inputStorage.size()));
116 benchmark::DoNotOptimize(input.begin());
117
118 std::vector<T, DefaultInitAllocatorAdaptor<T, std::allocator<T>>> output;
119 output.reserve(implicit_cast<size_t>(numChunks));
120
121 for (auto _ : state) {
122 output.clear();
123
125 std::back_inserter(output));
126 for (int chunkIndex = 0; chunkIndex != numChunks; ++chunkIndex) {
127 const auto chunk =
128 input.getBlock(bytesPerChunk, chunkIndex).getAsArray1DRef();
129#if defined(__clang__)
130#pragma clang loop unroll(full)
131#endif
132 for (int byteOfChunk = 0; byteOfChunk != bytesPerChunk; ++byteOfChunk) {
133 *iter = chunk(byteOfChunk);
134 }
135 }
136 }
137 assert(implicit_cast<int64_t>(output.size()) == numChunks);
138
139 state.SetComplexityN(numBytes);
140 state.counters.insert({
141 {"Throughput",
142 benchmark::Counter(sizeof(std::byte) * state.complexity_length_n(),
143 benchmark::Counter::Flags::kIsIterationInvariantRate,
144 benchmark::Counter::kIs1024)},
145 {"Latency",
146 benchmark::Counter(sizeof(std::byte) * state.complexity_length_n(),
147 benchmark::Counter::Flags::kIsIterationInvariantRate |
148 benchmark::Counter::Flags::kInvert,
149 benchmark::Counter::kIs1000)},
150 });
151}
152
153void CustomArguments(benchmark::internal::Benchmark* b) {
154 b->Unit(benchmark::kMicrosecond);
155
156 static constexpr int L1dByteSize = 32U * (1U << 10U);
157 static constexpr int L2dByteSize = 512U * (1U << 10U);
158 static constexpr int MaxBytesOptimal = L2dByteSize * (1U << 5);
159
160 if (benchmarkDryRun()) {
161 b->Arg(L1dByteSize);
162 return;
163 }
164
165 b->RangeMultiplier(2);
166 if constexpr ((true))
167 b->Arg(MaxBytesOptimal);
168 else
169 b->Range(1, 2048UL << 20)->Complexity(benchmark::oN);
170}
171
172// NOLINTBEGIN(cppcoreguidelines-macro-usage)
173
174// NOLINTNEXTLINE(bugprone-macro-parentheses)
175#define GEN(I, C) BENCHMARK(I<C>)->Apply(CustomArguments)
176
177#define GEN_T(I) \
178 GEN(I, NoCoalescing); \
179 GEN(I, CoalesceTo<uint16_t>); \
180 GEN(I, CoalesceTo<uint32_t>); \
181 GEN(I, CoalesceTo<uint64_t>)
182
185
186// NOLINTEND(cppcoreguidelines-macro-usage)
187
188} // namespace
189
190} // namespace rawspeed
191
BENCHMARK_MAIN()
assert(dim.area() >=area)
bool RAWSPEED_READNONE benchmarkDryRun()
constexpr uint64_t RAWSPEED_READNONE roundUpDivisionSafe(uint64_t value, uint64_t div)
Definition Common.h:145
constexpr RAWSPEED_READNONE Ttgt implicit_cast(Tsrc value)
Definition Casts.h:32
throw T(buf.data())
CoalescingOutputIterator(T) -> CoalescingOutputIterator< T >
Array1DRef(T *data_, int numElts_) -> Array1DRef< T >