SH4ZAM! 0.1.0
Fast math library for the Sega Dreamcast's SH4 CPU
Loading...
Searching...
No Matches
shz_mem.h
Go to the documentation of this file.
1/*! \file
2 * \brief Memory API
3 * \ingroup memory
4 *
5 * API built around copying, assigning, and working with memory.
6 *
7 * \todo
8 * - shz_macw()
9 * - shz_memset2()
10 * - shz_memset4()
11 * - shz_memset32()
12 * - shz_memset()
13 * - shz_memmove()
14 *
15 * \author 2025, 2026 Falco Girgis
16 * \author 2020 MoopTheHedgehog
17 *
18 * \copyright MIT License
19 */
20
21#ifndef SHZ_MEM_H
22#define SHZ_MEM_H
23
24#include "shz_cdefs.h"
25
26#include <stdbool.h>
27#include <stddef.h>
28
29/*! \defgroup memory Memory
30 \brief Routines for managing memory.
31
32 This API provides the following types of memory routines:
33 - barriers
34 - special instruction intrinsics
35 - cache operations
36 - memcpy()-type routines
37
38 \note
39 memcpy()-like routines will typically always check for
40 proper alignment and size increments of parameters using
41 assert(), so make sure to build a release build (-DNDEBUG)
42 for maximal gainz, when not debugging.
43 */
44
45/*! \name Barriers
46 \brief Macros for preventing GCC from reordering instructions.
47 @{
48*/
49
50//! Creates a software memory barrier beyond which any loads or stores may not be reordered
51#define SHZ_MEMORY_BARRIER_SOFT() asm volatile("" : : : "memory")
52//! Creates a hardware memory barrier beyond which any loads or stores may not be reordered
53#define SHZ_MEMORY_BARRIER_HARD() __sync_synchronize()
54
55//! @}
56
57SHZ_DECLS_BEGIN
58
59/*! Intrinsic around the SH4 `MOVCA.L` instruction.
60
61 Preallocates the cache-line containing \p src.
62
 63 Zero-initializes all 32 bytes within the \p src cache-line,
64 setting the valid bit to `1`.
65*/
66SHZ_FORCE_INLINE void shz_dcache_alloc_line(void* src) SHZ_NOEXCEPT;
67
68/*! Generic drop-in fast memcpy() replacement.
69
70 Copies \p bytes from \p src to \p dst, determining the most efficient
71 specialization to call into at run-time, returning \p dst.
72
73 There are no alignment or size requirements for this routine.
74
75 \note
76 When you know of and can control the \p src and \p dst alignments and
77 batch sizes, you can micro-optimize by calling into the most specific
78 memcpy() specialization for your given scenario, over just using this
79 generic implementation, which must choose which one to use at run-time.
80
81 \warning
82 \p dst and \p src buffers should not be overlapping.
83
84 \sa shz_memcpy1(), shz_memcpy2(), shz_memcpy4(), shz_memcpy8(), shz_memcpy32(),
85 shz_memcpy64(), shz_memcpy128()
86*/
87SHZ_INLINE void* shz_memcpy( void* SHZ_RESTRICT dst,
88 const void* SHZ_RESTRICT src,
89 size_t bytes) SHZ_NOEXCEPT;
90
91/*! \name Specializations
92 \brief Specialized routines for specific sizes + alignments.
93 @{
94*/
95
96/*! Copies an unaligned buffer to another one byte at a time.
97
98 The \p dst pointer is returned.
99
100 \note
101 Typically, unless you know you are copying a tiny number of
102 definitely unaligned bytes, you want to use shz_memcpy(),
103 which automatically handles arbitrary alignment for you,
104 potentially more efficiently than copying byte-by-byte.
105
106 \warning
107 \p dst and \p src buffers should not be overlapping.
108
109 \sa shz_memcpy()
110*/
111SHZ_FORCE_INLINE void* shz_memcpy1( void* SHZ_RESTRICT dst,
112 const void* SHZ_RESTRICT src,
113 size_t bytes) SHZ_NOEXCEPT;
114
115/*! Copies from one 2-byte aligned buffer to another two bytes at a time.
116
117 The \p dst pointer is returned.
118
119 \warning
120 \p dst and \p src must both be aligned by at least 2 bytes, and \p bytes
121 must be a multiple of 2.
122
123 \warning
124 \p dst and \p src buffers should not be overlapping.
125*/
126SHZ_INLINE void* shz_memcpy2( void* SHZ_RESTRICT dst,
127 const void* SHZ_RESTRICT src,
128 size_t bytes) SHZ_NOEXCEPT;
129
130/*! Copies from one 4-byte aligned buffer to another 4 bytes at a time.
131
132 The \p dst buffer is returned.
133
134 \warning
135 \p dst and \p src must both be aligned by at least 4 bytes, and
136 \p bytes must be a multiple of 4.
137
138 \warning
139 \p dst and \p src buffers should not be overlapping.
140*/
141SHZ_INLINE void* shz_memcpy4( void* SHZ_RESTRICT dst,
142 const void* SHZ_RESTRICT src,
143 size_t bytes) SHZ_NOEXCEPT;
144
145/*! Copies from one 8-byte aligned buffer to another 8 bytes at a time.
146
147 The \p dst buffer is returned.
148
149 \warning
150 \p dst and \p src must both be aligned by at least 8 bytes, and
151 \p bytes must be a multiple of 8.
152
153 \warning
154 \p src and \p dst should not overlap.
155*/
156SHZ_INLINE void* shz_memcpy8( void* SHZ_RESTRICT dst,
157 const void* SHZ_RESTRICT src,
158 size_t bytes) SHZ_NOEXCEPT;
159
160/*! Assigns the given 8-byte \p value to the \p bytes in \p dst.
161
162 \warning
163 \p dst should be at least 8-byte aligned, and \p bytes should be
164 a multiple of 8!
165*/
166SHZ_INLINE void* shz_memset8(void* dst, uint64_t value, size_t bytes) SHZ_NOEXCEPT;
167
168/*! Copies \p bytes from the \p src to the \p dst buffer in 32-byte chunks.
169
170 Transfers from 8-byte aligned buffer, \p src to 32-byte aligned buffer, \p dst,
171 32 bytes at a time. Returns the \p dst address.
172
173 \warning
174 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
175 be a multiple of 32.
176
177 \warning
178 \p src and \p dst buffers must not overlap.
179
180 \note
181 This is the quickest way to move 32-byte chunks of data around *within memory*, but
182 the shz_sq_memcpy32() will be faster when writing through the cache to external memory.
183
184 \sa shz_sq_memcpy32()
185*/
186SHZ_INLINE void* shz_memcpy32( void* SHZ_RESTRICT dst,
187 const void* SHZ_RESTRICT src,
188 size_t bytes) SHZ_NOEXCEPT;
189
190/*! Copies \p bytes from \p src to \p dst in 32-byte chunks, using the Store Queues.
191
192 Transfers from 8-byte aligned buffer, \p src to 4-byte aligned address, \p dst,
193 32 bytes at a time, writing through the cache, using the SH4's Store Queues.
194 Returns the \p dst address.
195
196 \warning
197 \p src must be at least 8-byte aligned, while \p dst can be only 4-byte aligned.
198 \p bytes must be a multiple of 32.
199
200 \note
201 This is the quickest way to move 32-byte chunks of data to *external memory*.
202 When copying to cached memory, you must invalidate the cache lines containing
203 \p dst before initiating the copy... Which means this routine becomes slower
204 than doing memory-to-memory copies with shz_memcpy32().
205
206 \sa shz_memcpy32(), shz_sq_memcpy32_1()
207*/
208SHZ_INLINE void* shz_sq_memcpy32( void* SHZ_RESTRICT dst,
209 const void* SHZ_RESTRICT src,
210 size_t bytes) SHZ_NOEXCEPT;
211
212/*! Copies \p bytes from \p src to \p dst in 32-byte chunks, using the Store Queues and XMTRX.
213
214 Equivalent to shz_sq_memcpy32(), except copying is done through XMTRX.
215
216 \warning
217 This routine clobbers XMTRX.
218*/
219SHZ_INLINE void* shz_sq_memcpy32_xmtrx( void* SHZ_RESTRICT dst,
220 const void* SHZ_RESTRICT src,
221 size_t bytes) SHZ_NOEXCEPT;
222
223/*! Specialized memcpy() variant for copying multiples of 64-bytes.
224
225 Copies from an 8-byte aligned buffer to a 32-byte aligned buffer, 64 bytes at a time.
226 Returns the \p dst address.
227
228 \warning
229 \p src and \p dst buffers must not overlap.
230
231 \warning
232 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
233 be a multiple of 64.
234*/
235SHZ_INLINE void* shz_memcpy64( void* SHZ_RESTRICT dst,
236 const void* SHZ_RESTRICT src,
237 size_t bytes) SHZ_NOEXCEPT;
238
239/*! Specialized memcpy() variant for copying multiples of 128 bytes.
240
241 Copies from an 8-byte aligned buffer to a 32-byte aligned buffer, 128 bytes at a time.
242 Returns the \p dst address.
243
244 \warning
245 \p src and \p dst buffers must not overlap.
246
247 \warning
248 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
249 be a multiple of 128.
250*/
251SHZ_INLINE void* shz_memcpy128( void* SHZ_RESTRICT dst,
252 const void* SHZ_RESTRICT src,
253 size_t bytes) SHZ_NOEXCEPT;
254
255//! @}
256
257/*! \name Constant-sized Operations
258 \brief Specialized routines for operating on statically sized buffers.
259 @{
260*/
261
262/*! Copies 16 shorts from \p src to \p dst.
263
264 \warning
265 \p src and \p dst buffers must not overlap.
266
267 \warning
268 \p dst and \p src must both be aligned by at least two bytes.
269*/
270SHZ_INLINE void shz_memcpy2_16( void* SHZ_RESTRICT dst,
271 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
272
273/*! Sets the values of the 16 shorts pointed to by \p dst to the given \p value.
274
275 \warning
276 \p dst must be aligned by at least two bytes.
277*/
278SHZ_INLINE void shz_memset2_16(void* dst, uint16_t value) SHZ_NOEXCEPT;
279
280/*! Copies 16 4-byte, long values from \p src to \p dst.
281
282 \warning
283 \p src and \p dst buffers must not overlap.
284
285 \warning
286 The \p src and \p dst buffers must both be at least 4-byte aligned.
287*/
288SHZ_INLINE void shz_memcpy4_16( void* SHZ_RESTRICT dst,
289 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
290
291/*! Copies 32 bytes from \p src to \p dst as a single chunk.
292
293 \warning
294 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned.
295*/
296SHZ_INLINE void shz_memcpy32_1( void* SHZ_RESTRICT dst,
297 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
298
299/*! Swaps the values within the given 32-byte buffers.
300
301 \warning
302 \p p1 and \p p2 must be at least 8-byte aligned.
303*/
304SHZ_INLINE void shz_memswap32_1(void* SHZ_RESTRICT p1,
305 void* SHZ_RESTRICT p2) SHZ_NOEXCEPT;
306
307/*! Swaps the values within the given 32-byte buffers, using XMTRX.
308
309 Equivalent to shz_memswap32_1(), except swapping is done through XMTRX.
310
311 \warning
312 This routine clobbers XMTRX!
313*/
314SHZ_INLINE void shz_memswap32_1_xmtrx(void* SHZ_RESTRICT p1,
315 void* SHZ_RESTRICT p2) SHZ_NOEXCEPT;
316
317/*! Copies \p src to \p dst in a single 32-byte transaction using the Store Queues.
318
319 \note
320 The Store Queues bypass the SH4's data-cache! They are typically used to
321 transfer to *external memory* and are slower for memory-to-memory transactions.
322
323 \warning
324 \p dst must be at least 4-byte aligned, while \p src must be at least 8-byte aligned.
325
326 \sa shz_memcpy32()
327*/
328SHZ_INLINE void* shz_sq_memcpy32_1( void* SHZ_RESTRICT dst,
329 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
330
331
332/*! Copies \p src to \p dst in a single 32-byte transaction using the Store Queues and XMTRX.
333
334 Equivalent to shz_sq_memcpy32_1(), except copying is done through XMTRX.
335
336 \warning
337 This routine clobbers XMTRX.
338
339 \sa shz_memcpy32()
340*/
341SHZ_INLINE void* shz_sq_memcpy32_1_xmtrx( void* SHZ_RESTRICT dst,
342 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
343
344//! @}
345
346#include "inline/shz_mem.inl.h"
347
348SHZ_DECLS_END
349
350#endif
void * shz_memset8(void *dst, uint64_t value, size_t bytes) SHZ_NOEXCEPT
Assigns the given 8-byte value to the bytes in dst.
void * shz_memcpy4(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 4-byte aligned buffer to another 4 bytes at a time.
void * shz_sq_memcpy32_xmtrx(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from src to dst in 32-byte chunks, using the Store Queues and XMTRX.
void * shz_sq_memcpy32(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from src to dst in 32-byte chunks, using the Store Queues.
void shz_memcpy2_16(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 16 shorts from src to dst.
void shz_memswap32_1_xmtrx(void *SHZ_RESTRICT p1, void *SHZ_RESTRICT p2) SHZ_NOEXCEPT
Swaps the values within the given 32-byte buffers, using XMTRX.
void * shz_sq_memcpy32_1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies src to dst in a single 32-byte transaction using the Store Queues.
void shz_memswap32_1(void *SHZ_RESTRICT p1, void *SHZ_RESTRICT p2) SHZ_NOEXCEPT
Swaps the values within the given 32-byte buffers.
void shz_dcache_alloc_line(void *src) SHZ_NOEXCEPT
Intrinsic around the SH4 MOVCA.L instruction.
void * shz_memcpy2(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 2-byte aligned buffer to another two bytes at a time.
void * shz_sq_memcpy32_1_xmtrx(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies src to dst in a single 32-byte transaction using the Store Queues and XMTRX.
void * shz_memcpy64(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Specialized memcpy() variant for copying multiples of 64-bytes.
void * shz_memcpy1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies an unaligned buffer to another one byte at a time.
void * shz_memcpy(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Generic drop-in fast memcpy() replacement.
void shz_memcpy32_1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 32 bytes from src to dst as a single chunk.
void shz_memcpy4_16(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 16 4-byte, long values from src to dst.
void * shz_memcpy32(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from the src to the dst buffer in 32-byte chunks.
void shz_memset2_16(void *dst, uint16_t value) SHZ_NOEXCEPT
Sets the values of the 16 shorts pointed to by dst to the given value.
void * shz_memcpy8(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 8-byte aligned buffer to another 8 bytes at a time.
void * shz_memcpy128(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Specialized memcpy() variant for copying multiples of 128 bytes.