1 /** A GC with segregated allocations using D's `static foreach`.
2  *
3  * Copyright: Copyright Per Nordlöw 2019 - .
4  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
5  * Authors:   Per Nordlöw
6  */
7 module segregated_gc;
8 
private static void *os_mem_map(size_t nbytes) nothrow @nogc
{
    import core.sys.posix.sys.mman;
    void* p = mmap(null, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    return (p == MAP_FAILED) ? null : p;
}
16 
17 private static int os_mem_unmap(void *base, size_t nbytes) nothrow @nogc
18 {
19     import core.sys.posix.sys.mman;
20     return munmap(base, nbytes);
21 }
22 
23 // import gc.os : os_mem_map, os_mem_unmap;
24 
25 import core.gc.config;
26 import core.gc.gcinterface;
27 
28 import paged_dynamic_array : Array = PagedDynamicArray;
29 import simple_static_bitarray : StaticBitArray;
30 
31 import core.stdc.stdio: printf;
32 import cstdlib = core.stdc.stdlib : calloc, free, malloc, realloc;
33 static import core.memory;
34 
35 /* debug = PRINTF; */
36 
37 extern (C) void onOutOfMemoryError(void* pretend_sideffect = null) @trusted pure nothrow @nogc; /* dmd @@@BUG11461@@@ */
38 
39 private
40 {
41     extern (C)
42     {
43         // to allow compilation of this module without access to the rt package,
44         //  make these functions available from rt.lifetime
45         void rt_finalizeFromGC(void* p, size_t size, uint attr) nothrow;
46         int rt_hasFinalizerInSegment(void* p, size_t size, uint attr, in void[] segment) nothrow;
47 
48         // Declared as an extern instead of importing core.exception
49         // to avoid inlining - see issue 13725.
50         void onInvalidMemoryOperationError() @nogc nothrow;
51         void onOutOfMemoryErrorNoGC() @nogc nothrow;
52     }
53 }
54 
55 enum WORDSIZE = size_t.sizeof;  ///< Size of word type (size_t).
enum PAGESIZE = 4096;           ///< Page size in bytes (`getconf PAGESIZE` on Linux).
57 
/// Small slot size classes (in bytes).
59 static immutable smallSizeClasses = [8,
60                                      16,
61                                      24, // TODO: move
62                                      32,
63                                      40, // TODO: move
64                                      48, // TODO: move
65                                      56, // TODO: move
66                                      64,
67                                      72,
68                                      80,
69                                      88,
70                                      96, // TODO: move
71                                      104, // TODO: move
72                                      112, // TODO: move
73                                      120, // TODO: move
74                                      128, // TODO: 128 +64,
75                                      256, // TODO: 256 + 128,
76                                      512, // TODO: 512 + 256,
77                                      1024, // TODO: 1024 + 512,
78                                      2048, // TODO: 2048 + 1024,
79     ];
80 
/// Medium slot size classes (in bytes).
82 version(none)
83 static immutable mediumSizeClasses = [1 << 12, // 4096
84                                       1 << 13, // 8192
85                                       1 << 14, // 16384
86                                       1 << 15, // 32768
87                                       1 << 16, // 65536
88     ];
89 
/// Round `sz` up to the closest power of two (the size-class ceiling).
91 size_t ceilPow2(size_t sz) @safe pure nothrow @nogc
92 {
93     return nextPow2(sz - 1);
94 }
95 
96 @safe pure nothrow @nogc unittest
97 {
98     // TODO: assert(ceilPow2(1) == 1);
99     assert(ceilPow2(2) == 2);
100     assert(ceilPow2(3) == 4);
101     assert(ceilPow2(4) == 4);
102     assert(ceilPow2(5) == 8);
103     assert(ceilPow2(6) == 8);
104     assert(ceilPow2(7) == 8);
105     assert(ceilPow2(8) == 8);
106     assert(ceilPow2(9) == 16);
107 }
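
/** A small illustration (test-only) of how `qalloc` below maps a requested byte
 * size onto one of the power-of-two entries in `smallSizeClasses`: the request
 * is rounded up with `ceilPow2` and clamped to the smallest class. The helper
 * `sizeClassOf` exists only in this unittest and mirrors that clamping logic.
 */
@safe pure nothrow @nogc unittest
{
    static size_t sizeClassOf(size_t size) @safe pure nothrow @nogc
    {
        enum size_t smallest = smallSizeClasses[0];
        const rounded = ceilPow2(size);
        return rounded < smallest ? smallest : rounded;
    }
    assert(sizeClassOf(1) == 8);      // tiny requests land in the smallest class
    assert(sizeClassOf(24) == 32);    // `ceilPow2` skips the non-power-of-two 24-byte class
    assert(sizeClassOf(100) == 128);
    assert(sizeClassOf(2048) == 2048);
}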
108 
/// Small slot containing `wordCount` machine words.
110 struct SmallSlot(uint wordCount)
111 if (wordCount >= 1)
112 {
113     size_t[wordCount] words;    // words
114 }
115 
116 @safe pure nothrow @nogc unittest
117 {
118     SmallSlot!(1) x;
119     SmallSlot!(1) y;
120 }
121 
122 /// Small page storing slots of size `sizeClass`.
123 struct SmallPage(uint sizeClass)
124 if (sizeClass >= smallSizeClasses[0])
125 {
126     enum wordCount = sizeClass/WORDSIZE;
127     static assert(sizeClass % WORDSIZE == 0, sizeClass);
128     enum slotCount = PAGESIZE/sizeClass;
129     alias Slot = SmallSlot!(wordCount);
130 
131     Slot[slotCount] slots;
132     byte[PAGESIZE-slots.sizeof] __padding;
133     static assert(this.sizeof == PAGESIZE); // TODO: adjust if pages of different byte sizes are preferred
134 }
135 enum minimumSmallPageWordCount = PAGESIZE/WORDSIZE; // TODO: may be computed
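
/** A minimal compile-time sanity check of the page layout (illustrative only):
 * a page is exactly `PAGESIZE` bytes, split into `PAGESIZE/sizeClass` slots of
 * `sizeClass/WORDSIZE` words each.
 */
@safe pure nothrow @nogc unittest
{
    alias P = SmallPage!(64);
    static assert(P.wordCount == 64/WORDSIZE);
    static assert(P.slotCount == PAGESIZE/64);
    static assert(P.sizeof == PAGESIZE);
}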
136 
137 struct SmallPageTable(uint sizeClass)
138 {
139     alias Page = SmallPage!(sizeClass);
140     Page* pagePtr;
141     enum slotCount = PAGESIZE/sizeClass;
142 
143     // bit `i` indicates if slot `i` in `*pagePtr` currently contains an initialized value
144     StaticBitArray!(slotCount) slotUsages; // TODO: benchmark with a byte-array instead for comparison
145 
146     // bit `i` indicates if slot `i` in `*pagePtr` has been marked
147     StaticBitArray!(slotCount) slotMarks;
148 }
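
/** A small compile-time check (illustrative only): the bookkeeping in
 * `SmallPageTable` tracks one usage bit and one mark bit per slot of the page
 * it describes.
 */
@safe pure nothrow @nogc unittest
{
    alias T = SmallPageTable!(64);
    static assert(is(T.Page == SmallPage!(64)));
    static assert(T.slotCount == PAGESIZE/64); // one `slotUsages` and one `slotMarks` bit per slot
}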
149 
150 /// Small pool of pages.
151 struct SmallPool(uint sizeClass, bool pointerFlag)
152 if (sizeClass >= smallSizeClasses[0])
153 {
154     alias Page = SmallPage!(sizeClass);
155 
156     this(size_t pageTableCapacity)
157     {
158         pageTables.capacity = pageTableCapacity;
159     }
160 
161     void* allocateNext() @trusted // pure nothrow @nogc
162     {
163         version(LDC) pragma(inline, true);
164 
        // TODO: scan `slotUsages` starting at `slotIndex` using `core.bitop.bsf` to
        // find the first free slot, if any. Use a variant of `indexOfFirstOne` that
        // takes a start index (`slotIndex`); if there is no hit, set `slotIndex` to
        // `Page.slotCount`. See the `indexOfFirstZeroFrom` sketch after this struct.
        // TODO: instead of this, find the next set bit at `slotIndex` in
        // `slotUsages`, unless the whole current `slotUsages`-word is all zero.
171 
172         immutable pageIndex = slotIndex / Page.slotCount;
173         immutable needNewPage = (slotIndex % Page.slotCount == 0);
174 
175         if (needNewPage)
176         {
            Page* pagePtr = cast(Page*)os_mem_map(PAGESIZE);
            if (pagePtr is null)
                onOutOfMemoryErrorNoGC(); // mapping a new page failed
            debug(PRINTF) printf("### %s(): pagePtr:%p\n", __FUNCTION__.ptr, pagePtr);
            pageTables.insertBack(SmallPageTable!sizeClass(pagePtr));

            pageTables.ptr[pageIndex].slotUsages[0] = true; // mark first slot of the new page

            debug(PRINTF) printf("### %s(): slotIndex:%lu\n", __FUNCTION__.ptr, size_t(0));

            auto slotPtr = pagePtr.slots.ptr; // first slot of the new page
            slotIndex += 1;                   // advance the pool-global slot index
            return slotPtr;
        }
        else
        {
            immutable pageLocalSlotIndex = slotIndex % Page.slotCount; // slot index within the current page
            debug(PRINTF) printf("### %s(): slotIndex:%lu\n", __FUNCTION__.ptr, slotIndex);
            pageTables.ptr[pageIndex].slotUsages[pageLocalSlotIndex] = true; // mark slot
            slotIndex += 1;
            return &pageTables.ptr[pageIndex].pagePtr.slots.ptr[pageLocalSlotIndex];
        }
195     }
196 
197     Array!(SmallPageTable!sizeClass) pageTables;
    size_t slotIndex = 0;       // index of the first free slot in the pool, counted across all its pages
199 }
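
/** A minimal sketch of the free-slot scan hinted at by the TODO in
 * `allocateNext` above: find the first zero bit (free slot) at or after
 * `startIndex` in a bitmap backed by `size_t` words. The helper name
 * `indexOfFirstZeroFrom` and the raw `size_t[]` interface are assumptions made
 * for illustration only; `StaticBitArray` may expose its storage differently.
 */
private size_t indexOfFirstZeroFrom(scope const(size_t)[] usageWords, size_t startIndex)
    @safe pure nothrow @nogc
{
    import core.bitop : bsf;
    enum bitsPerWord = 8*size_t.sizeof;
    foreach (wordIndex; startIndex/bitsPerWord .. usageWords.length)
    {
        size_t freeBits = ~usageWords[wordIndex]; // set bits now mark free slots
        if (wordIndex == startIndex/bitsPerWord)
            freeBits &= ~((size_t(1) << (startIndex % bitsPerWord)) - 1); // ignore bits before `startIndex`
        if (freeBits != 0)
            return wordIndex*bitsPerWord + bsf(freeBits);
    }
    return usageWords.length*bitsPerWord; // no free slot found
}

///
@safe pure nothrow @nogc unittest
{
    size_t[1] usages = [0b0111];                    // slots 0..2 used, slot 3 free
    assert(indexOfFirstZeroFrom(usages[], 0) == 3);
    assert(indexOfFirstZeroFrom(usages[], 3) == 3);
    assert(indexOfFirstZeroFrom(usages[], 4) == 4);
}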
200 
201 // TODO: @safe pure nothrow @nogc
202 unittest
203 {
204     static foreach (sizeClass; smallSizeClasses)
205     {
206         {
207             SmallPool!(sizeClass, false) x;
208         }
209     }
210 }
211 
212 /// All small pools.
213 struct SmallPools
214 {
215     this(size_t pageTableCapacity)
216     {
217         static foreach (sizeClass; smallSizeClasses)
218         {
219             // Quote from https://olshansky.me/gc/runtime/dlang/2017/06/14/inside-d-gc.html
220             // "Fine grained locking from the start, I see no problem with per pool locking."
221             mixin(`this.unscannedPool` ~ sizeClass.stringof ~ ` = SmallPool!(sizeClass, false)(pageTableCapacity);`);
222             mixin(`this.scannedPool`    ~ sizeClass.stringof ~ ` = SmallPool!(sizeClass, true)(pageTableCapacity);`);
223         }
224     }
225     BlkInfo qalloc(size_t size, uint bits) nothrow
226     {
227         debug(PRINTF) printf("### %s(size:%lu, bits:%u)\n", __FUNCTION__.ptr, size, bits);
228 
229         BlkInfo blkinfo = void;
230         blkinfo.attr = bits;
231 
232         // TODO: optimize this:
233         blkinfo.size = ceilPow2(size);
234         if (blkinfo.size < smallSizeClasses[0])
235             blkinfo.size = smallSizeClasses[0];
236 
237     top:
238         switch (blkinfo.size)
239         {
240             static foreach (sizeClass; smallSizeClasses)
241             {
242             case sizeClass:
243                 if (bits & BlkAttr.NO_SCAN) // no scanning needed
244                     mixin(`blkinfo.base = unscannedPool` ~ sizeClass.stringof ~ `.allocateNext();`);
245                 else
246                     mixin(`blkinfo.base = scannedPool` ~ sizeClass.stringof ~ `.allocateNext();`);
247                 break top;
248             }
249         default:
250             blkinfo.base = null;
251             printf("### %s(size:%lu, bits:%u) Cannot handle blkinfo.size:%lu\n", __FUNCTION__.ptr, size, bits, blkinfo.size);
252             // TODO: find closest match of blkinfo.size <= smallSizeClasses
253             onOutOfMemoryError();
254             assert(0, "Handle other blkinfo.size");
255         }
256 
257         return blkinfo;
258     }
259 private:
260     static foreach (sizeClass; smallSizeClasses)
261     {
262         // Quote from https://olshansky.me/gc/runtime/dlang/2017/06/14/inside-d-gc.html
263         // "Fine grained locking from the start, I see no problem with per pool locking."
264         mixin(`SmallPool!(sizeClass, false) unscannedPool` ~ sizeClass.stringof ~ `;`);
265         mixin(`SmallPool!(sizeClass, true) scannedPool` ~ sizeClass.stringof ~ `;`);
266     }
267 }
268 // pragma(msg, "SmallPools.sizeof: ", SmallPools.sizeof);
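
/** A small compile-time check (illustrative only) that the `static foreach` +
 * `mixin` scheme above generates one pool member per size class, named after
 * the class, for instance `unscannedPool32` and `scannedPool64`.
 */
@safe pure nothrow @nogc unittest
{
    static assert(is(typeof(SmallPools.init.unscannedPool32) == SmallPool!(32, false)));
    static assert(is(typeof(SmallPools.init.scannedPool64) == SmallPool!(64, true)));
}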
269 
enum pageTableCapacityDefault = 256*PAGESIZE; // one megabyte (256 pages) per table
271 
272 struct Gcx
273 {
    this(size_t pageTableCapacity) // typically `pageTableCapacityDefault` (one megabyte per table)
275     {
276         this.smallPools = SmallPools(pageTableCapacity);
277     }
278     Array!Root roots;
279     Array!Range ranges;
280     SmallPools smallPools;
281     uint disabled; // turn off collections if >0
282 }
283 
284 Gcx tlGcx;                      // thread-local allocator instance
285 static this()
286 {
287     tlGcx = Gcx(pageTableCapacityDefault);
288 }
289 
290 // size class specific overloads only for thread-local GC
291 extern (C)
292 {
293     static foreach (sizeClass; smallSizeClasses)
294     {
295         /* TODO: use template `mixin` containing, in turn, a `mixin` for generating
296          * the symbol names `gc_tlmalloc_32`, `unscannedPool32` and
297          * `scannedPool32` for sizeClass `32`.
298          *
299          * TODO: Since https://github.com/dlang/dmd/pull/8813 we can now use:
300          * `mixin("gc_tlmalloc_", sizeClass);` for symbol generation
301          */
302         mixin(`
303         void* gc_tlmalloc_` ~ sizeClass.stringof ~ `(uint ba = 0) @trusted nothrow
304         {
305             if (ba & BlkAttr.NO_SCAN) // no scanning needed
306                 return tlGcx.smallPools.unscannedPool` ~ sizeClass.stringof ~ `.allocateNext();
307             else
308                 return tlGcx.smallPools.scannedPool` ~ sizeClass.stringof ~ `.allocateNext();
309         }
310 `);
311     }
312 }
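
/** A minimal usage sketch of the generated per-size-class entry points
 * (assumption: they are meant to be callable directly, for instance from
 * benchmarks). `gc_tlmalloc_32` hands out one 32-byte slot from the
 * thread-local pools.
 */
unittest
{
    void* p = gc_tlmalloc_32(BlkAttr.NO_SCAN); // unscanned 32-byte slot
    assert(p !is null);
    void* q = gc_tlmalloc_32(BlkAttr.NO_SCAN);
    assert(q !is p);                           // consecutive allocations yield distinct slots
}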
313 
314 class SegregatedGC : GC
315 {
316     import core.internal.spinlock;
317     static gcLock = shared(AlignedSpinLock)(SpinLock.Contention.lengthy);
318     static bool _inFinalizer;
319 
320     // global allocator (`__gshared`)
321     __gshared Gcx gGcx;
322 
323     // lock GC, throw InvalidMemoryOperationError on recursive locking during finalization
324     static void lockNR() @nogc nothrow
325     {
326         if (_inFinalizer)
327             onInvalidMemoryOperationError();
328         gcLock.lock();
329     }
330 
331     void initialize()
332     {
333         printf("### %s()\n", __FUNCTION__.ptr);
334 
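        // Allocate and emplace the GC instance on the C heap by hand: copy the
        // class' `typeid` initializer into the malloc'ed block and call the
        // constructor explicitly, since `new` cannot be used while the GC
        // itself is being set up.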
335         import core.stdc.string;
336         auto p = cstdlib.malloc(__traits(classInstanceSize, SegregatedGC));
337         if (!p)
338             onOutOfMemoryError();
339 
340         auto init = typeid(SegregatedGC).initializer();
341         assert(init.length == __traits(classInstanceSize, SegregatedGC));
342         auto instance = cast(SegregatedGC)memcpy(p, init.ptr, init.length);
343         instance.__ctor();
344 
345         instance.gGcx = Gcx(pageTableCapacityDefault);
346     }
347 
348     void finalize()
349     {
350         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
351     }
352 
353     this()
354     {
355     }
356 
357     void enable()
358     {
359         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
360         static void go(Gcx* tlGcx) nothrow
361         {
362             tlGcx.disabled--;
363         }
364         runLocked!(go)(&tlGcx);
365     }
366 
367     void disable()
368     {
369         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
370         static void go(Gcx* tlGcx) nothrow
371         {
372             tlGcx.disabled++;
373         }
374         runLocked!(go)(&tlGcx);
375     }
376 
377     auto runLocked(alias func, Args...)(auto ref Args args)
378     {
379         debug(PROFILE_API) immutable tm = (config.profile > 1 ? currTime.ticks : 0);
380         lockNR();
381         scope (failure) gcLock.unlock();
382         debug(PROFILE_API) immutable tm2 = (config.profile > 1 ? currTime.ticks : 0);
383 
384         static if (is(typeof(func(args)) == void))
385             func(args);
386         else
387             auto res = func(args);
388 
389         debug(PROFILE_API) if (config.profile > 1) { lockTime += tm2 - tm; }
390         gcLock.unlock();
391 
392         static if (!is(typeof(func(args)) == void))
393             return res;
394     }
395 
396     void collect() nothrow
397     {
398         printf("### %s: \n", __FUNCTION__.ptr);
399     }
400 
401     void collectNoStack() nothrow
402     {
403         printf("### %s: \n", __FUNCTION__.ptr);
404     }
405 
406     void minimize() nothrow
407     {
408         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
409     }
410 
411     uint getAttr(void* p) nothrow
412     {
413         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
414         return 0;
415     }
416 
417     uint setAttr(void* p, uint mask) nothrow
418     {
419         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
420         return 0;
421     }
422 
423     uint clrAttr(void* p, uint mask) nothrow
424     {
425         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
426         return 0;
427     }
428 
429     void* malloc(size_t size, uint bits, const TypeInfo ti) nothrow
430     {
431         debug(PRINTF) printf("### %s(size:%lu, bits:%u)\n", __FUNCTION__.ptr, size, bits);
432         lockNR();
433         scope (failure) gcLock.unlock(); // TODO: why needed?
434         void* p = gGcx.smallPools.qalloc(size, bits).base;
435         gcLock.unlock();
436         if (size && p is null)
437             onOutOfMemoryError();
438         return p;
439     }
440 
441     BlkInfo qalloc(size_t size, uint bits, const TypeInfo ti) nothrow
442     {
443         debug(PRINTF) printf("### %s(size:%lu, bits:%u)\n", __FUNCTION__.ptr, size, bits);
444         lockNR();
445         scope (failure) gcLock.unlock(); // TODO: why needed?
446         BlkInfo blkinfo = gGcx.smallPools.qalloc(size, bits);
447         gcLock.unlock();
448         if (size && blkinfo.base is null)
449             onOutOfMemoryError();
450         return blkinfo;
451     }
452 
453     void* calloc(size_t size, uint bits, const TypeInfo ti) nothrow
454     {
455         debug(PRINTF) printf("### %s(size:%lu, bits:%u)\n", __FUNCTION__.ptr, size, bits);
456         lockNR();
457         scope (failure) gcLock.unlock();
458         void* p = gGcx.smallPools.qalloc(size, bits).base;
459         gcLock.unlock();
460         if (size && p is null)
461             onOutOfMemoryError();
462         import core.stdc.string : memset;
463         memset(p, 0, size);     // zero
464         // why is this slower than memset? (cast(size_t*)p)[0 .. size/size_t.sizeof] = 0;
465         return p;
466     }
467 
468     void* realloc(void* p, size_t size, uint bits, const TypeInfo ti) nothrow
469     {
470         debug(PRINTF) printf("### %s(p:%p, size:%lu, bits:%u)\n", __FUNCTION__.ptr, p, size, bits);
471         p = cstdlib.realloc(p, size);
472         if (size && p is null)
473             onOutOfMemoryError();
474         return p;
475     }
476 
477     /**
478      * Attempt to in-place enlarge the memory block pointed to by p by at least
479      * minsize bytes, up to a maximum of maxsize additional bytes.
480      * This does not attempt to move the memory block (like realloc() does).
481      *
482      * Returns:
483      *  0 if could not extend p,
484      *  total size of entire memory block if successful.
485      */
486     size_t extend(void* p, size_t minsize, size_t maxsize, const TypeInfo ti) nothrow
487     {
488         debug(PRINTF) printf("### %s(p:%p, minsize:%lu, maxsize:%lu)\n", __FUNCTION__.ptr, p, minsize, maxsize);
489         return 0;
490     }
491 
492     size_t reserve(size_t size) nothrow
493     {
494         debug(PRINTF) printf("### %s(size:%lu)\n", __FUNCTION__.ptr, size);
495         return 0;
496     }
497 
498     void free(void* p) nothrow @nogc
499     {
500         debug(PRINTF) printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
501         cstdlib.free(p);
502     }
503 
504     /**
505      * Determine the base address of the block containing p.  If p is not a gc
506      * allocated pointer, return null.
507      */
508     void* addrOf(void* p) nothrow @nogc
509     {
510         debug(PRINTF) printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
511         return null;
512     }
513 
514     /**
515      * Determine the allocated size of pointer p.  If p is an interior pointer
516      * or not a gc allocated pointer, return 0.
517      */
518     size_t sizeOf(void* p) nothrow @nogc
519     {
520         debug(PRINTF) printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
521         return 0;
522     }
523 
    /**
     * Retrieve the block info (base, size, attributes) of the block containing p.
     * If p is not a gc allocated pointer, return `BlkInfo.init`.
     */
528     BlkInfo query(void* p) nothrow
529     {
530         debug(PRINTF) printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
531         return BlkInfo.init;
532     }
533 
534     void addRoot(void* p) nothrow @nogc
535     {
536         printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
537         tlGcx.roots.insertBack(Root(p));
538     }
539 
540     /**
541      * remove p from list of roots
542      */
543     void removeRoot(void* p) nothrow @nogc
544     {
545         debug(PRINTF) printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
546         foreach (ref root; tlGcx.roots)
547         {
548             if (root is p)
549             {
550                 root = tlGcx.roots.back;
551                 tlGcx.roots.popBack();
552                 return;
553             }
554         }
555         assert(false);
556     }
557 
558     @property RootIterator rootIter() return @nogc
559     {
560         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
561         return &rootsApply;
562     }
563 
564     private int rootsApply(scope int delegate(ref Root) nothrow dg)
565     {
566         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
567         foreach (ref r; tlGcx.roots)
568             if (auto result = dg(r))
569                 return result;
570         return 0;
571     }
572 
573     /**
574      * Add range to scan for roots.
575      */
576     void addRange(void* p, size_t sz, const TypeInfo ti = null) nothrow @nogc
577     {
578         int x;
579         printf("### %s(p:%p, sz:%lu ti:%p, stack:%p)\n", __FUNCTION__.ptr, p, sz, ti, &x);
580         if (p is null)
581             return;
582         tlGcx.ranges.insertBack(Range(p, p + sz, cast() ti));
583     }
584 
585     /**
586      * Remove range `p`.
587      */
588     void removeRange(void* p) nothrow @nogc
589     {
590         debug(PRINTF) printf("### %s(p:%p)\n", __FUNCTION__.ptr, p);
591         if (p is null)
592             return;
593         foreach (ref range; tlGcx.ranges)
594         {
595             if (range.pbot is p)
596             {
597                 range = tlGcx.ranges.back;
598                 tlGcx.ranges.popBack();
599                 return;
600             }
601         }
602         assert(false);
603     }
604 
605     @property RangeIterator rangeIter() return @nogc
606     {
607         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
608         return &rangesApply;
609     }
610 
611     private int rangesApply(scope int delegate(ref Range) nothrow dg)
612     {
613         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
614         foreach (ref range; tlGcx.ranges)
615             if (auto result = dg(range))
616                 return result;
617         return 0;
618     }
619 
620     /**
621      * Run finalizers.
622      */
623     void runFinalizers(const scope void[] segment) nothrow
624     {
625         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
626     }
627 
    bool inFinalizer() nothrow
    {
        return _inFinalizer;
    }
632 
633     /**
634      * Retrieve statistics about garbage collection.
635      * Useful for debugging and tuning.
636      */
637     core.memory.GC.Stats stats() nothrow
638     {
639         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
640         // TODO: fill in
641         return typeof(return)();
642     }
643 
644     /**
645      * Retrieve profile statistics about garbage collection.
646      * Useful for debugging and tuning.
647      */
648     core.memory.GC.ProfileStats profileStats() nothrow
649     {
650         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
651         // TODO: fill in
652         return typeof(return)();
653     }
654 
655     /**
656      * Returns the number of bytes allocated for the current thread
657      * since program start. It is the same as
658      * GC.stats().allocatedInCurrentThread, but faster.
659      */
660     ulong allocatedInCurrentThread() nothrow
661     {
662         debug(PRINTF) printf("### %s: \n", __FUNCTION__.ptr);
663         // TODO: fill in
664         return typeof(return).init;
665     }
666 }
667 
668 private enum PowType
669 {
670     floor,
671     ceil
672 }
673 
674 private T powIntegralImpl(PowType type, T)(T val)
675 {
676     version(D_Coverage) {} else pragma(inline, true);
677     import core.bitop : bsr;
678     if (val == 0 ||
679         (type == PowType.ceil &&
680          (val > T.max / 2 ||
681           val == T.min)))
682         return 0;
683     else
        return (T(1) << (bsr(val) + type));
685 }
686 
687 private T nextPow2(T)(const T val)
688 if (is(T == size_t) ||
689     is(T == uint))
690 {
691     return powIntegralImpl!(PowType.ceil)(val);
692 }
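
/** A small check of `nextPow2` (illustrative only): exact powers of two are
 * bumped to the next power, which is why `ceilPow2` above passes `sz - 1`.
 */
@safe pure nothrow @nogc unittest
{
    assert(nextPow2(size_t(3)) == 4);
    assert(nextPow2(size_t(4)) == 8); // strictly greater for exact powers of two
    assert(nextPow2(size_t(0)) == 0); // zero maps to zero
}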