From: chris.m.thomasson.1@gmail.com   
      
   On 12/30/2025 12:59 PM, BGB wrote:   
   > On 12/30/2025 12:00 PM, Scott Lurndal wrote:   
   >> "Chris M. Thomasson" writes:   
   >>> On 12/28/2025 4:41 PM, BGB wrote:   
   >>> [...]   
   >>>   
   >>> Also, if using something like LOCK CMPXCHG you MUST make sure to align   
   >>> and pad your relevant data structures to a l2 cache line.   
   >>   
   >> That may not be necessary if there is otherwise no false sharing in   
   >> the same cache line. Yes, the operand should be naturally aligned,   
   >> (which ensures it is entirely contained within a single cache line),   
   >> but there's no reason that other data cannot be stored in the same   
   >> cache line, so long as it is unlikely to be accessed by a competing   
   >> thread.   
   >>   
   >   
   > Yes, or the "small brain" option of just making the mutex larger than   
   > the size of the cache line and putting the relevant part in the middle...   
   >   
   > struct PaddedMutex_s {   
   > u64 pad1, pad2, pad3;   
   > u64 real_part;   
   > u64 pad4, pad5, pad6;   
   > };   
   >   
   > Then say (assuming a 32 byte cache line), no non-pad values can be in   
   > the same cache line as real_part.   
   >   
   > Little bigger for a 64 byte cache line, but same general idea.   
      
   :^) Yeah. That can help. I was referring to the anchor of, say a   
   lock-free stack. That anchor better be aligned and padded. An anchor:   
      
   /*
    * Anchor of a lock-free stack, as described in the surrounding text: two
    * words that are meant to be updated together by a single double-width
    * CAS (e.g. LOCK CMPXCHG8B on a 32-bit system).
    */
   struct ct_anchor
   {
    struct node* next; /* presumably the current top node of the stack */
    uintptr_t ver; /* version word; the text ties it to ABA avoidance */
   };
      
   ct_anchor is (better be ;^) a double word ripe for a DWCAS say, LOCK   
   CMPXCHG8B on a 32 bit system.   
      
   That ct_anchor needs to be properly aligned and padded up to an L2 cache
   line. LL/SC is a different story. The version is not needed because a
   proper LL/SC gets around ABA. But! That single word should be padded and
   aligned on a reservation granule.
      
   Now, the struct nodes. Heck, they can be L2 cache line aligned and
   padded regions of memory. Say, from an L2 cache-block lock-free allocator.
      
   Fwiw, here is some old test code of mine for a region allocator that can
   help align things. This was written before standard alignment support
   (say, _Alignof) was widely available:
      
      
   #if ! defined (RALLOC_H)   
   # define RALLOC_H   
   # if defined (__cplusplus)   
    extern "C" {   
   # endif   
   /**************************************************************/   
      
      
      
      
   #include <assert.h>
   #include <stddef.h>
      
      
      
      
   #if defined (_MSC_VER)   
   /* warning C4116: unnamed type definition in parentheses */   
   # pragma warning (disable : 4116)   
   #endif   
      
      
      
      
   #if ! defined (NDEBUG)   
   # include    
   # define RALLOC_DBG_PRINTF(mp_exp) printf mp_exp   
   #else   
   # define RALLOC_DBG_PRINTF(mp_exp) ((void)0)   
   #endif   
      
      
      
      
   #if ! defined (RALLOC_UINTPTR_TYPE)   
   # define RALLOC_UINTPTR_TYPE size_t   
   #endif   
      
      
      
      
   typedef RALLOC_UINTPTR_TYPE ralloc_uintptr_type;   
      
      
   typedef char ralloc_static_assert[   
    sizeof(ralloc_uintptr_type) == sizeof(void*) ? 1 : -1   
   ];   
      
      
      
      
   enum ralloc_align_enum {   
    ALIGN_ENUM   
   };   
      
      
   struct ralloc_align_struct {   
    char pad;   
    double type;   
   };   
      
      
   union ralloc_align_max {   
    char char_;   
    short int short_;   
    int int_;   
    long int long_;   
    float float_;   
    double double_;   
    long double long_double_;   
    void* ptr_;   
    void* (*fptr_) (void*);   
    enum ralloc_align_enum enum_;   
    struct ralloc_align_struct struct_;   
    size_t size_t_;   
    ptrdiff_t ptrdiff_t;   
   };   
      
      
   #define RALLOC_ALIGN_OF(mp_type) \   
    offsetof( \   
    struct { \   
    char pad_RALLOC_ALIGN_OF; \   
    mp_type type_RALLOC_ALIGN_OF; \   
    }, \   
    type_RALLOC_ALIGN_OF \   
    )   
      
      
   /*
    * Alignment of union ralloc_align_max as measured by RALLOC_ALIGN_OF.
    * rallocex falls back to this value when it is given an alignment of 0,
    * and the ralloc macro passes it explicitly.
    */
   #define RALLOC_ALIGN_MAX RALLOC_ALIGN_OF(union ralloc_align_max)
      
      
   #define RALLOC_ALIGN_UP(mp_ptr, mp_align) \   
    ((void*)( \   
    (((ralloc_uintptr_type)(mp_ptr)) + ((mp_align) - 1)) \   
    & ~(((mp_align) - 1)) \   
    ))   
      
      
   #define RALLOC_ALIGN_ASSERT(mp_ptr, mp_align) \   
    (((void*)(mp_ptr)) == RALLOC_ALIGN_UP(mp_ptr, mp_align))   
      
      
      
      
   /*
    * State of one region (bump) allocator. The storage itself is supplied by
    * the caller through rinit; this structure only tracks it.
    */
   struct region {
    unsigned char* buffer; /* start of the backing storage, set by rinit */
    size_t size; /* capacity of buffer in bytes, set by rinit */
    size_t offset; /* bytes handed out so far; rallocex advances it, rflush zeroes it */
   };
      
      
   static void   
   rinit(   
    struct region* const self,   
    void* buffer,   
    size_t size   
   ) {   
    self->buffer = buffer;   
    self->size = size;   
    self->offset = 0;   
      
    RALLOC_DBG_PRINTF((   
    "rinit(%p) {\n"   
    " buffer = %p\n"   
    " size = %lu\n"   
    "}\n\n\n",   
    (void*)self,   
    buffer,   
    (unsigned long int)size   
    ));   
   }   
      
      
   static void*   
   rallocex(   
    struct region* const self,   
    size_t size,   
    size_t align   
   ) {   
    unsigned char* align_buffer;   
    size_t offset = self->offset;   
    unsigned char* raw_buffer = self->buffer + offset;   
      
    if (! size) {   
    size = 1;   
    }   
      
    if (! align) {   
    align = RALLOC_ALIGN_MAX;   
    }   
      
    assert(align == 1 || RALLOC_ALIGN_ASSERT(align, 2));   
      
    align_buffer = RALLOC_ALIGN_UP(raw_buffer, align);   
      
    assert(RALLOC_ALIGN_ASSERT(align_buffer, align));   
      
    size += align_buffer - raw_buffer;   
      
    if (offset + size > self->size) {   
    return NULL;   
    }   
      
    self->offset = offset + size;   
      
    RALLOC_DBG_PRINTF((   
    "rallocex(%p) {\n"   
    " size = %lu\n"   
    " alignment = %lu\n"   
    " origin offset = %lu\n"   
    " final offset = %lu\n"   
    " raw_buffer = %p\n"   
    " align_buffer = %p\n"   
    " size adjustment = %lu\n"   
    " final size = %lu\n"   
    "}\n\n\n",   
    (void*)self,   
    (unsigned long int)size - (align_buffer - raw_buffer),   
    (unsigned long int)align,   
    (unsigned long int)offset,   
    (unsigned long int)self->offset,   
    (void*)raw_buffer,   
    (void*)align_buffer,   
    (unsigned long int)(align_buffer - raw_buffer),   
    (unsigned long int)size   
    ));   
      
    return align_buffer;   
   }   
      
      
   #define ralloc(mp_self, mp_size) \   
    rallocex((mp_self), (mp_size), RALLOC_ALIGN_MAX)   
      
   #define ralloct(mp_self, mp_count, mp_type) \   
    rallocex( \   
    (mp_self), \   
    sizeof(mp_type) * (mp_count),\   
    RALLOC_ALIGN_OF(mp_type) \   
    )   
      
      
   static void   
   rflush(   
    struct region* const self   
   ) {   
    self->offset = 0;   
      
    RALLOC_DBG_PRINTF((   
    "rflush(%p) {}\n\n\n",   
    (void*)self   
    ));   
   }   
      
      
      
      
   #undef RALLOC_DBG_PRINTF   
   #undef RALLOC_UINTPTR_TYPE   
      
      
      
      
   /**************************************************************/   
   # if defined (__cplusplus)   
    }   
   # endif   
   #endif   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|