SH4ZAM! 0.1.0
Fast math library for the Sega Dreamcast's SH4 CPU
Loading...
Searching...
No Matches
shz_mem.h
Go to the documentation of this file.
1/*! \file
2 * \brief Memory API
3 * \ingroup memory
4 *
5 * API built around copying, assigning, and working with memory.
6 *
7 * \todo
8 * - shz_macw()
9 * - shz_memset2()
10 * - shz_memset4()
11 * - shz_memset32()
12 * - shz_memset()
13 * - shz_memmoveN()
14 *
15 * \author 2025, 2026 Falco Girgis
16 * \author 2020 MoopTheHedgehog
17 *
18 * \copyright MIT License
19 */
20
21#ifndef SHZ_MEM_H
22#define SHZ_MEM_H
23
24#include "shz_cdefs.h"
25
26#include <stdbool.h>
27#include <stddef.h>
28
29/*! \defgroup memory Memory
30 \brief Routines for managing memory.
31
32 This API provides the following types of memory routines:
33 - special instruction intrinsics
34 - cache operations
35 - memcpy()-type routines
36
37 \note
38 memcpy()-like routines will typically check for
39 proper alignment and size increments of parameters using
40 assert(), so make sure to build a release build (-DNDEBUG)
41 for maximal gainz, when not debugging.
42 */
43
44SHZ_DECLS_BEGIN
45
46/*! \name C stdlib Replacements
47 \brief Routines replacing the C standard library copy/set API.
48 @{
49*/
50
51/*! Generic drop-in fast memcpy() replacement.
52
53 Copies \p bytes from \p src to \p dst, determining the most efficient
54 specialization to call into at run-time, returning \p dst.
55
56 There are no alignment or size requirements for this routine.
57
58 \note
59 When you know of and can control the \p src and \p dst alignments and
60 batch sizes, you can micro-optimize by calling into the most specific
61 memcpy() specialization for your given scenario, over just using this
62 generic implementation, which must choose which one to use at run-time.
63
64 \warning
65 \p dst and \p src buffers should not be overlapping.
66
67 \sa shz_memcpy1(), shz_memcpy2(), shz_memcpy4(), shz_memcpy8(), shz_memcpy32(),
68 shz_memcpy64(), shz_memcpy128()
69*/
70SHZ_INLINE void* shz_memcpy( void* SHZ_RESTRICT dst,
71 const void* SHZ_RESTRICT src,
72 size_t bytes) SHZ_NOEXCEPT;
73
74/*! Generic drop-in fast memmove() replacement.
75
76 Copies \p bytes from \p src to \p dst, determining the most efficient
77 specialization to call into at run-time, returning \p dst. The source and
78 destination buffers are allowed to overlap, making this routine slightly
79 less efficient, but more versatile than shz_memcpy().
80
81 \note
82 There is no alignment or size requirement for this routine.
83
84 \sa shz_memcpy()
85*/
86SHZ_INLINE void* shz_memmove(void* dst, const void* src, size_t bytes) SHZ_NOEXCEPT;
87
88//! @}
89
90/*! \name Specializations
91 \brief Specialized routines for specific sizes + alignments.
92 @{
93*/
94
95/*! Copies an unaligned buffer to another one byte at a time.
96
97 The \p dst pointer is returned.
98
99 \note
100 Typically, unless you know you are copying a tiny number of
101 definitely unaligned bytes, you want to use shz_memcpy(),
102 which automatically handles arbitrary alignment for you,
103 potentially more efficiently than copying byte-by-byte.
104
105 \warning
106 \p dst and \p src buffers should not be overlapping.
107
108 \sa shz_memcpy()
109*/
110SHZ_INLINE void* shz_memcpy1( void* SHZ_RESTRICT dst,
111 const void* SHZ_RESTRICT src,
112 size_t bytes) SHZ_NOEXCEPT;
113
114/*! Copies from one 2-byte aligned buffer to another two bytes at a time.
115
116 The \p dst pointer is returned.
117
118 \warning
119 \p dst and \p src must both be aligned by at least 2 bytes, and \p bytes
120 must be a multiple of 2.
121
122 \warning
123 \p dst and \p src buffers should not be overlapping.
124*/
125SHZ_INLINE void* shz_memcpy2( void* SHZ_RESTRICT dst,
126 const void* SHZ_RESTRICT src,
127 size_t bytes) SHZ_NOEXCEPT;
128
129/*! Copies from one 4-byte aligned buffer to another, 4 bytes at a time.
130
131 The \p dst pointer is returned.
132
133 \warning
134 \p dst and \p src must both be aligned by at least 4 bytes, and
135 \p bytes must be a multiple of 4.
136
137 \warning
138 \p dst and \p src buffers should not be overlapping.
139*/
140SHZ_INLINE void* shz_memcpy4( void* SHZ_RESTRICT dst,
141 const void* SHZ_RESTRICT src,
142 size_t bytes) SHZ_NOEXCEPT;
143
144/*! Copies from one 8-byte aligned buffer to another, 8 bytes at a time.
145
146 The \p dst pointer is returned.
147
148 \warning
149 \p dst and \p src must both be aligned by at least 8 bytes, and
150 \p bytes must be a multiple of 8.
151
152 \warning
153 \p src and \p dst should not overlap.
154*/
155SHZ_INLINE void* shz_memcpy8( void* SHZ_RESTRICT dst,
156 const void* SHZ_RESTRICT src,
157 size_t bytes) SHZ_NOEXCEPT;
158
159/*! Assigns the given 8-byte \p value to the \p bytes in \p dst.
160
161 \warning
162 \p dst should be at least 8-byte aligned, and \p bytes should be
163 a multiple of 8!
164*/
165SHZ_INLINE void* shz_memset8(void* dst, uint64_t value, size_t bytes) SHZ_NOEXCEPT;
166
167/*! Copies \p bytes from the \p src to the \p dst buffer in 32-byte chunks.
168
169 Transfers from 8-byte aligned buffer, \p src to 32-byte aligned buffer, \p dst,
170 32 bytes at a time. Returns the \p dst address.
171
172 \warning
173 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
174 be a multiple of 32.
175
176 \warning
177 \p src and \p dst buffers must not overlap.
178
179 \note
180 This is the quickest way to move 32-byte chunks of data around *within memory*, but
181 shz_sq_memcpy32() will be faster when writing to external memory, bypassing the cache.
182
183 \sa shz_sq_memcpy32()
184*/
185SHZ_INLINE void* shz_memcpy32( void* SHZ_RESTRICT dst,
186 const void* SHZ_RESTRICT src,
187 size_t bytes) SHZ_NOEXCEPT;
188
189/*! Copies \p bytes from \p src to \p dst in 32-byte chunks, using the Store Queues.
190
191 Transfers from 8-byte aligned buffer, \p src to 4-byte aligned address, \p dst,
192 32 bytes at a time, bypassing the data cache, using the SH4's Store Queues.
193 Returns the \p dst address.
194
195 \warning
196 \p src must be at least 8-byte aligned, while \p dst can be only 4-byte aligned.
197 \p bytes must be a multiple of 32.
198
199 \note
200 This is the quickest way to move 32-byte chunks of data to *external memory*.
201 When copying to cached memory, you must invalidate the cache lines containing
202 \p dst before initiating the copy... Which means this routine becomes slower
203 than doing memory-to-memory copies with shz_memcpy32().
204
205 \sa shz_memcpy32(), shz_sq_memcpy32_1()
206*/
207SHZ_INLINE void* shz_sq_memcpy32( void* SHZ_RESTRICT dst,
208 const void* SHZ_RESTRICT src,
209 size_t bytes) SHZ_NOEXCEPT;
210
211/*! Copies \p bytes from \p src to \p dst in 32-byte chunks, using the Store Queues and XMTRX.
212
213 Equivalent to shz_sq_memcpy32(), except copying is done through XMTRX.
214
215 \warning
216 This routine clobbers XMTRX.
217*/
218SHZ_INLINE void* shz_sq_memcpy32_xmtrx( void* SHZ_RESTRICT dst,
219 const void* SHZ_RESTRICT src,
220 size_t bytes) SHZ_NOEXCEPT;
221
222/*! Specialized memcpy() variant for copying multiples of 64-bytes.
223
224 Copies from an 8-byte aligned buffer to a 32-byte aligned buffer, 64 bytes at a time.
225 Returns the \p dst address.
226
227 \warning
228 \p src and \p dst buffers must not overlap.
229
230 \warning
231 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
232 be a multiple of 64.
233*/
234SHZ_INLINE void* shz_memcpy64( void* SHZ_RESTRICT dst,
235 const void* SHZ_RESTRICT src,
236 size_t bytes) SHZ_NOEXCEPT;
237
238/*! Specialized memcpy() variant for copying multiples of 128 bytes.
239
240 Copies from an 8-byte aligned buffer to a 32-byte aligned buffer, 128 bytes at a time.
241 Returns the \p dst address.
242
243 \warning
244 \p src and \p dst buffers must not overlap.
245
246 \warning
247 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
248 be a multiple of 128.
249*/
250SHZ_INLINE void* shz_memcpy128( void* SHZ_RESTRICT dst,
251 const void* SHZ_RESTRICT src,
252 size_t bytes) SHZ_NOEXCEPT;
253
254//! @}
255
256/*! \name Constant-sized Operations
257 \brief Specialized routines for operating on statically sized buffers.
258 @{
259*/
260
261/*! Copies 8 shorts from \p src to \p dst.
262
263 \warning
264 \p src and \p dst buffers must not overlap.
265
266 \warning
267 \p dst and \p src must both be aligned by at least two bytes.
268*/
269SHZ_INLINE void shz_memcpy2_8( void* SHZ_RESTRICT dst,
270 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
271
272/*! Copies 16 shorts from \p src to \p dst.
273
274 \warning
275 \p src and \p dst buffers must not overlap.
276
277 \warning
278 \p dst and \p src must both be aligned by at least two bytes.
279*/
280SHZ_INLINE void shz_memcpy2_16( void* SHZ_RESTRICT dst,
281 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
282
283/*! Sets the values of the 16 shorts pointed to by \p dst to the given \p value.
284
285 \warning
286 \p dst must be aligned by at least two bytes.
287*/
288SHZ_INLINE void shz_memset2_16(void* dst, uint16_t value) SHZ_NOEXCEPT;
289
290/*! Copies 16 4-byte, long values from \p src to \p dst.
291
292 \warning
293 \p src and \p dst buffers must not overlap.
294
295 \warning
296 The \p src and \p dst buffers must both be at least 4-byte aligned.
297*/
298SHZ_INLINE void shz_memcpy4_16( void* SHZ_RESTRICT dst,
299 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
300
301/*! Copies 32 bytes from \p src to \p dst as a single chunk.
302
303 \warning
304 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned.
305*/
306SHZ_INLINE void shz_memcpy32_1( void* SHZ_RESTRICT dst,
307 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
308
309/*! Swaps the values within the given 32-byte buffers.
310
311 \warning
312 \p p1 and \p p2 must be at least 8-byte aligned.
313*/
314SHZ_INLINE void shz_memswap32_1(void* SHZ_RESTRICT p1,
315 void* SHZ_RESTRICT p2) SHZ_NOEXCEPT;
316
317/*! Swaps the values within the given 32-byte buffers, using XMTRX.
318
319 Equivalent to shz_memswap32_1(), except swapping is done through XMTRX.
320
321 \warning
322 This routine clobbers XMTRX!
323*/
324SHZ_INLINE void shz_memswap32_1_xmtrx(void* SHZ_RESTRICT p1,
325 void* SHZ_RESTRICT p2) SHZ_NOEXCEPT;
326
327/*! Copies \p src to \p dst in a single 32-byte transaction using the Store Queues.
328
329 \note
330 The Store Queues bypass the SH4's data-cache! They are typically used to
331 transfer to *external memory* and are slower for memory-to-memory transactions.
332
333 \warning
334 \p dst must be at least 4-byte aligned, while \p src must be at least 8-byte aligned.
335
336 \sa shz_memcpy32()
337*/
338SHZ_INLINE void* shz_sq_memcpy32_1( void* SHZ_RESTRICT dst,
339 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
340
341
342/*! Copies \p src to \p dst in a single 32-byte transaction using the Store Queues and XMTRX.
343
344 Equivalent to shz_sq_memcpy32_1(), except copying is done through XMTRX.
345
346 \warning
347 This routine clobbers XMTRX.
348
349 \sa shz_memcpy32()
350*/
351SHZ_INLINE void* shz_sq_memcpy32_1_xmtrx( void* SHZ_RESTRICT dst,
352 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
353
354/*! Intrinsic around the SH4 `MOVCA.L` instruction.
355
356 Preallocates the cache-line containing \p src.
357
358 Zero-initializes all 32-bytes within the \p src cache-line,
359 setting the valid bit to `1`.
360*/
361SHZ_INLINE void shz_dcache_alloc_line(void* src) SHZ_NOEXCEPT;
362
363//! @}
364
365#include "inline/shz_mem.inl.h"
366
367SHZ_DECLS_END
368
369#endif
void * shz_memset8(void *dst, uint64_t value, size_t bytes) SHZ_NOEXCEPT
Assigns the given 8-byte value to the bytes in dst.
void * shz_memcpy4(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 4-byte aligned buffer to another, 4 bytes at a time.
void * shz_sq_memcpy32_xmtrx(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from src to dst in 32-byte chunks, using the Store Queues and XMTRX.
void * shz_sq_memcpy32(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from src to dst in 32-byte chunks, using the Store Queues.
void shz_memcpy2_16(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 16 shorts from src to dst.
void shz_memswap32_1_xmtrx(void *SHZ_RESTRICT p1, void *SHZ_RESTRICT p2) SHZ_NOEXCEPT
Swaps the values within the given 32-byte buffers, using XMTRX.
void * shz_sq_memcpy32_1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies src to dst in a single 32-byte transaction using the Store Queues.
void shz_memcpy2_8(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 8 shorts from src to dst.
void shz_memswap32_1(void *SHZ_RESTRICT p1, void *SHZ_RESTRICT p2) SHZ_NOEXCEPT
Swaps the values within the given 32-byte buffers.
void shz_dcache_alloc_line(void *src) SHZ_NOEXCEPT
Intrinsic around the SH4 MOVCA.L instruction.
void * shz_memcpy2(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 2-byte aligned buffer to another two bytes at a time.
void * shz_sq_memcpy32_1_xmtrx(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies src to dst in a single 32-byte transaction using the Store Queues and XMTRX.
void * shz_memmove(void *dst, const void *src, size_t bytes) SHZ_NOEXCEPT
Generic drop-in fast memmove() replacement.
void * shz_memcpy64(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Specialized memcpy() variant for copying multiples of 64-bytes.
void * shz_memcpy1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies an unaligned buffer to another one byte at a time.
void * shz_memcpy(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Generic drop-in fast memcpy() replacement.
void shz_memcpy32_1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 32 bytes from src to dst as a single chunk.
void shz_memcpy4_16(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 16 4-byte, long values from src to dst.
void * shz_memcpy32(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from the src to the dst buffer in 32-byte chunks.
void shz_memset2_16(void *dst, uint16_t value) SHZ_NOEXCEPT
Sets the values of the 16 shorts pointed to by dst to the given value.
void * shz_memcpy8(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 8-byte aligned buffer to another, 8 bytes at a time.
void * shz_memcpy128(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Specialized memcpy() variant for copying multiples of 128 bytes.