61 int64_t numBytes = state.range(0);
62 const int bytesPerChunk =
sizeof(
T);
63 const auto numChunks =
65 numBytes = bytesPerChunk * numChunks;
67 std::vector<T, DefaultInitAllocatorAdaptor<T, std::allocator<T>>> output;
70 for (
auto _ : state) {
74 std::back_inserter(output));
76 for (
int chunkIndex = 0; chunkIndex != numChunks; ++chunkIndex) {
78#pragma clang loop unroll(full)
80 for (
int byteOfChunk = 0; byteOfChunk != bytesPerChunk; ++byteOfChunk) {
81 benchmark::DoNotOptimize(value);
88 state.SetComplexityN(numBytes);
89 state.counters.insert({
91 benchmark::Counter(
sizeof(std::byte) * state.complexity_length_n(),
92 benchmark::Counter::Flags::kIsIterationInvariantRate,
93 benchmark::Counter::kIs1024)},
95 benchmark::Counter(
sizeof(std::byte) * state.complexity_length_n(),
96 benchmark::Counter::Flags::kIsIterationInvariantRate |
97 benchmark::Counter::Flags::kInvert,
98 benchmark::Counter::kIs1000)},
/// Google Benchmark body, templated on `C`, that copies data chunk by chunk
/// into `*iter` and reports byte-based counters.
///
/// NOTE(review): several statements of this function are not visible in this
/// excerpt (the definition of `T`, the `numChunks` initializer, and the
/// declarations of `input`, `iter` and `chunk`). The comments below describe
/// only the statements that are present; confirm the rest against the full
/// file.
///
/// @param state Benchmark state; `state.range(0)` supplies the requested
///              number of bytes.
102template <
typename C>
void BM_Copy(benchmark::State& state) {
// Requested workload size in bytes, taken from the first benchmark argument.
105 int64_t numBytes = state.range(0);
// One chunk is exactly one `T` wide (presumably `T` is derived from `C` on a
// line not shown here -- TODO confirm).
106 const int bytesPerChunk =
sizeof(
T);
107 const auto numChunks =
// numBytes is overwritten with chunk size times chunk count, so from here on
// it is a whole multiple of bytesPerChunk.
109 numBytes = bytesPerChunk * numChunks;
// Pass the input's begin iterator through DoNotOptimize so the optimizer has
// to treat it as used.
116 benchmark::DoNotOptimize(input.begin());
// Destination buffer; its allocator is wrapped in DefaultInitAllocatorAdaptor.
118 std::vector<T, DefaultInitAllocatorAdaptor<T, std::allocator<T>>> output;
// Measured region: one pass of this loop body per benchmark iteration.
121 for (
auto _ : state) {
125 std::back_inserter(output));
// Outer loop: visit every chunk; each chunk is obtained from `input` as a
// block of bytesPerChunk elements at index chunkIndex, viewed via
// getAsArray1DRef().
126 for (
int chunkIndex = 0; chunkIndex != numChunks; ++chunkIndex) {
128 input.getBlock(bytesPerChunk, chunkIndex).getAsArray1DRef();
// Clang-only hint requesting full unrolling of the per-byte loop below; the
// trip count is bytesPerChunk, i.e. sizeof(T).
129#if defined(__clang__)
130#pragma clang loop unroll(full)
// Inner loop: store each element of the current chunk through `iter`.
132 for (
int byteOfChunk = 0; byteOfChunk != bytesPerChunk; ++byteOfChunk) {
133 *iter = chunk(byteOfChunk);
// Report the (rounded) byte count as the problem size N for complexity
// analysis.
139 state.SetComplexityN(numBytes);
// Register two custom counters, both valued at the number of bytes
// (sizeof(std::byte) * N). The counter names are on lines not visible here.
// - first: kIsIterationInvariantRate, displayed with 1024-based units;
// - second: kIsIterationInvariantRate | kInvert, displayed with 1000-based
//   units (presumably a time-per-byte figure -- TODO confirm).
140 state.counters.insert({
142 benchmark::Counter(
sizeof(std::byte) * state.complexity_length_n(),
143 benchmark::Counter::Flags::kIsIterationInvariantRate,
144 benchmark::Counter::kIs1024)},
146 benchmark::Counter(
sizeof(std::byte) * state.complexity_length_n(),
147 benchmark::Counter::Flags::kIsIterationInvariantRate |
148 benchmark::Counter::Flags::kInvert,
149 benchmark::Counter::kIs1000)},
154 b->Unit(benchmark::kMicrosecond);
156 static constexpr int L1dByteSize = 32U * (1U << 10U);
157 static constexpr int L2dByteSize = 512U * (1U << 10U);
158 static constexpr int MaxBytesOptimal = L2dByteSize * (1U << 5);
165 b->RangeMultiplier(2);
166 if constexpr ((
true))
167 b->Arg(MaxBytesOptimal);
169 b->Range(1, 2048UL << 20)->Complexity(benchmark::oN);