/* automatically generated by memory-auto.sh, do not edit! */

/*
 * Copyright (c) 2005, 2006 Matt Fredette
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Matt Fredette.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* includes: */
#include <tme/memory.h>


/* the RCS Id string of memory-auto.sh, the script that generated this
   file, handed to _TME_RCSID (presumably so that it is recorded in
   the compiled object - confirm against the macro's definition): */
_TME_RCSID("$Id: memory-auto.sh,v 1.2 2010/02/15 15:16:28 fredette Exp $");

/* drop any macro version of tme_memory_bus_read16, so the real
   function can be defined below: */
#undef tme_memory_bus_read16

/* the slow-path 16-bit bus read.

   mem need not be aligned.  the value is gathered from the one or two
   aligned parts (each as wide as the host bus boundary,
   TME_MEMORY_BUS_BOUNDARY) that contain its bytes; each part is read
   with the matching tme_memory_atomic_readN, and the wanted bytes are
   shifted into place according to the host byte order.

   rwlock is handed unchanged to every atomic read.  align_min is not
   used here, and bus_boundary is only checked by the assertion.

   returns the 16-bit value assembled from the parts: */
tme_uint16_t
tme_memory_bus_read16(_tme_const tme_shared tme_uint16_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  const int host_is_little = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE);
  unsigned int bits_skip;	/* bits in the first part that precede the value */
  unsigned int bits_have;	/* bits of the value gathered so far */
  tme_uint16_t value;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  /* a 64-bit host boundary: */
  if (host_boundary == sizeof(tme_uint64_t)) {
    _tme_const tme_shared tme_uint64_t *part_ptr;
    tme_uint64_t part;

    /* round the address down to a 64-bit part, and note how far
       into that part the value starts: */
    part_ptr = (_tme_const tme_shared tme_uint64_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (64 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (64 / 8)) * 8;

    /* fetch the first part.  a little-endian host drops the
       skipped bits off the bottom; a big-endian host drops them
       off the top and brings what remains down to 16 bits: */
    part = tme_memory_atomic_read64(part_ptr, rwlock, sizeof(tme_uint64_t));
    if (host_is_little) {
      value = (tme_uint16_t) (part >> bits_skip);
    }
    else {
      value = (part << bits_skip) >> (64 - 16);
    }
    bits_have = 64 - bits_skip;

    /* the value may continue into the following part: */
    if (__tme_predict_false(bits_have < 16)) {

      /* a read-before-read barrier between the two part reads: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge in the missing bits,
         above the gathered bits on a little-endian host and
         below them on a big-endian host: */
      part_ptr++;
      part = tme_memory_atomic_read64(part_ptr, rwlock, sizeof(tme_uint64_t));
      if (host_is_little) {
        value |= ((tme_uint16_t) part) << bits_have;
      }
      else {
        value |= part >> ((64 - 16) + bits_have);
      }
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  /* a 32-bit host boundary: */
  if (host_boundary == sizeof(tme_uint32_t)) {
    _tme_const tme_shared tme_uint32_t *part_ptr;
    tme_uint32_t part;

    /* round the address down to a 32-bit part, and note how far
       into that part the value starts: */
    part_ptr = (_tme_const tme_shared tme_uint32_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (32 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (32 / 8)) * 8;

    /* fetch the first part and position its bits: */
    part = tme_memory_atomic_read32(part_ptr, rwlock, sizeof(tme_uint32_t));
    if (host_is_little) {
      value = (tme_uint16_t) (part >> bits_skip);
    }
    else {
      value = (part << bits_skip) >> (32 - 16);
    }
    bits_have = 32 - bits_skip;

    /* the value may continue into the following part: */
    if (__tme_predict_false(bits_have < 16)) {

      /* a read-before-read barrier between the two part reads: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge in the missing bits: */
      part_ptr++;
      part = tme_memory_atomic_read32(part_ptr, rwlock, sizeof(tme_uint32_t));
      if (host_is_little) {
        value |= ((tme_uint16_t) part) << bits_have;
      }
      else {
        value |= part >> ((32 - 16) + bits_have);
      }
    }
  }

  /* a 16-bit host boundary: */
  else if (host_boundary == sizeof(tme_uint16_t)) {
    _tme_const tme_shared tme_uint16_t *part_ptr;
    tme_uint16_t part;

    /* round the address down to a 16-bit part, and note how far
       into that part the value starts: */
    part_ptr = (_tme_const tme_shared tme_uint16_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (16 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (16 / 8)) * 8;

    /* fetch the first part and position its bits.  the part is as
       wide as the value, so a big-endian host only has to shift
       the skipped bits off the top: */
    part = tme_memory_atomic_read16(part_ptr, rwlock, sizeof(tme_uint16_t));
    if (host_is_little) {
      value = (tme_uint16_t) (part >> bits_skip);
    }
    else {
      value = ((tme_uint16_t) part) << bits_skip;
    }
    bits_have = 16 - bits_skip;

    /* the value may continue into the following part: */
    if (__tme_predict_false(bits_have < 16)) {

      /* a read-before-read barrier between the two part reads: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge in the missing bits: */
      part_ptr++;
      part = tme_memory_atomic_read16(part_ptr, rwlock, sizeof(tme_uint16_t));
      if (host_is_little) {
        value |= ((tme_uint16_t) part) << bits_have;
      }
      else {
        value |= ((tme_uint16_t) part) >> bits_have;
      }
    }
  }

  /* otherwise, the value is read one 8-bit part at a time: */
  else {
    _tme_const tme_shared tme_uint8_t *part_ptr;
    tme_uint8_t part;

    /* an 8-bit part is always aligned, so nothing is skipped: */
    part_ptr = (_tme_const tme_shared tme_uint8_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (8 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (8 / 8)) * 8;

    /* fetch the first part.  a big-endian host places it in the
       most-significant half of the value: */
    part = tme_memory_atomic_read8(part_ptr, rwlock, sizeof(tme_uint8_t));
    if (host_is_little) {
      value = (tme_uint16_t) (part >> bits_skip);
    }
    else {
      value = ((tme_uint16_t) part) << ((16 - 8) + bits_skip);
    }
    bits_have = 8 - bits_skip;

    /* with 8-bit parts a second part is always needed: */
    if (__tme_predict_false(bits_have < 16)) {

      /* a read-before-read barrier between the two part reads: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge it in: */
      part_ptr++;
      part = tme_memory_atomic_read8(part_ptr, rwlock, sizeof(tme_uint8_t));
      if (host_is_little) {
        value |= ((tme_uint16_t) part) << bits_have;
      }
      else {
        value |= (((tme_uint16_t) part) << (16 - 8)) >> bits_have;
      }
    }
  }

  /* hand back the assembled value: */
  return (value);
}

/* undefine the macro version of tme_memory_bus_write16: */
#undef tme_memory_bus_write16

/* the bus 16-bit write slow function.

   this stores the 16-bit value x at mem, which need not be aligned.
   the store is made by updating the one or two aligned parts (each
   as wide as the host bus boundary, TME_MEMORY_BUS_BOUNDARY) that
   contain the value's bytes.  each part is read, has the bits being
   written cleared and replaced, and is then handed to
   tme_memory_atomic_cxN (presumably a compare-and-exchange); the
   update is retried until the value that call returns matches the
   value that was compared against, so the bits of the part that are
   not being written are preserved.

   rwlock is handed unchanged to every atomic operation.  align_min
   is not used here, and bus_boundary is only checked by the
   assertion: */
void
tme_memory_bus_write16(tme_shared tme_uint16_t *mem, tme_uint16_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;
  tme_uint8_t part8_cmp;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t))
  {

    /* prepare to write the first 64-bit part of the memory: */
    parts64 = (tme_shared tme_uint64_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (64 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (64 / 8)) * 8;

    /* write the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    do {
      part64_cmp = part64;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part64 |= (((tme_uint64_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + 0)) >> size_skip);
        part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> size_skip);
      }

      /* loop until we can atomically update this part: */
      part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
    } while (part64 != part64_cmp);
    size_done = 64 - size_skip;

    /* write at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* discard the bits of x that have already been written.  this
         is done only here, where the shift count is known to be
         less than 16: shifting the promoted x by the full
         (64 - size_skip) in the common, non-spanning case would be
         a shift by at least the width of int, which is undefined: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= size_done;
      }
      else {
        x <<= size_done;
      }

      /* make a barrier: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;

        /* on a little-endian host, the remaining (16 - size_done)
           bits of the value belong in the least-significant bits of
           this part, so exactly those bits are cleared before the
           remaining data is inserted: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part64 &= ~(((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) >> size_done);
          part64 |= ((tme_uint64_t) x);
        }

        /* on a big-endian host, the remaining bits belong in the
           most-significant bits of this part: */
        else {
          part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + size_done)) >> 0);
          part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> 0);
        }

        /* loop until we can atomically update this part: */
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t))
  {

    /* prepare to write the first 32-bit part of the memory: */
    parts32 = (tme_shared tme_uint32_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (32 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (32 / 8)) * 8;

    /* write the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    do {
      part32_cmp = part32;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part32 |= (((tme_uint32_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + 0)) >> size_skip);
        part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> size_skip);
      }

      /* loop until we can atomically update this part: */
      part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
    } while (part32 != part32_cmp);
    size_done = 32 - size_skip;

    /* write at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* discard the bits of x that have already been written.  this
         is done only here, where the shift count is known to be
         less than 16; an aligned write would otherwise shift the
         promoted x by 32, which is undefined where int is 32 bits: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= size_done;
      }
      else {
        x <<= size_done;
      }

      /* make a barrier: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;

        /* on a little-endian host, the remaining (16 - size_done)
           bits of the value belong in the least-significant bits of
           this part, so exactly those bits are cleared before the
           remaining data is inserted: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part32 &= ~(((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) >> size_done);
          part32 |= ((tme_uint32_t) x);
        }

        /* on a big-endian host, the remaining bits belong in the
           most-significant bits of this part: */
        else {
          part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + size_done)) >> 0);
          part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> 0);
        }

        /* loop until we can atomically update this part: */
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t))
  {

    /* prepare to write the first 16-bit part of the memory: */
    parts16 = (tme_shared tme_uint16_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (16 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (16 / 8)) * 8;

    /* write the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    do {
      part16_cmp = part16;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part16 |= (((tme_uint16_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
        part16 |= (x >> ((16 - 16) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
    } while (part16 != part16_cmp);
    size_done = 16 - size_skip;

    /* write at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* discard the bits of x that have already been written: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= size_done;
      }
      else {
        x <<= size_done;
      }

      /* make a barrier: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;

        /* on a little-endian host, the remaining (16 - size_done)
           bits of the value belong in the least-significant bits of
           this part, so exactly those bits are cleared before the
           remaining data is inserted: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part16 &= ~(_tme_memory_type_mask(tme_uint16_t, + 0) >> size_done);
          part16 |= x;
        }

        /* on a big-endian host, the remaining bits belong in the
           most-significant bits of this part: */
        else {
          part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
          part16 |= (x >> ((16 - 16) + 0));
        }

        /* loop until we can atomically update this part: */
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
    }
  }

  else {

    /* prepare to write the first 8-bit part of the memory.  an 8-bit
       part is always aligned, so size_skip is always zero here: */
    parts8 = (tme_shared tme_uint8_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (8 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (8 / 8)) * 8;

    /* write the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    do {
      part8_cmp = part8;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part8 |= (((tme_uint8_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
        part8 |= (x >> ((16 - 8) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
    } while (part8 != part8_cmp);
    size_done = 8 - size_skip;

    /* with 8-bit parts, a second part always remains to be written: */
    if (__tme_predict_false(size_done < 16)) {

      /* discard the bits of x that have already been written: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= size_done;
      }
      else {
        x <<= size_done;
      }

      /* make a barrier: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;

        /* on a little-endian host, the remaining (16 - size_done)
           bits cover this entire 8-bit part, so all of it is cleared
           before the remaining data is inserted: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part8 &= ~(_tme_memory_type_mask(tme_uint16_t, + 0) >> size_done);
          part8 |= ((tme_uint8_t) x);
        }

        /* on a big-endian host, the remaining bits likewise cover
           this entire 8-bit part: */
        else {
          part8 &= ~_tme_memory_type_mask(tme_uint8_t, + 0);
          part8 |= (x >> ((16 - 8) + 0));
        }

        /* loop until we can atomically update this part: */
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
    }
  }
}

/* drop any macro version of tme_memory_bus_read32, so the real
   function can be defined below: */
#undef tme_memory_bus_read32

/* the slow-path 32-bit bus read.

   mem need not be aligned.  the value is gathered from the aligned
   parts (each as wide as the host bus boundary,
   TME_MEMORY_BUS_BOUNDARY) that contain its bytes; each part is read
   with the matching tme_memory_atomic_readN, and the wanted bytes are
   shifted into place according to the host byte order.

   rwlock is handed unchanged to every atomic read.  align_min is not
   used here, and bus_boundary is only checked by the assertion.

   returns the 32-bit value assembled from the parts: */
tme_uint32_t
tme_memory_bus_read32(_tme_const tme_shared tme_uint32_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  const int host_is_little = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE);
  unsigned int bits_skip;	/* bits in the first part that precede the value */
  unsigned int bits_have;	/* bits of the value gathered so far */
  tme_uint32_t value;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  /* a 64-bit host boundary: */
  if (host_boundary == sizeof(tme_uint64_t)) {
    _tme_const tme_shared tme_uint64_t *part_ptr;
    tme_uint64_t part;

    /* round the address down to a 64-bit part, and note how far
       into that part the value starts: */
    part_ptr = (_tme_const tme_shared tme_uint64_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (64 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (64 / 8)) * 8;

    /* fetch the first part.  a little-endian host drops the
       skipped bits off the bottom; a big-endian host drops them
       off the top and brings what remains down to 32 bits: */
    part = tme_memory_atomic_read64(part_ptr, rwlock, sizeof(tme_uint64_t));
    if (host_is_little) {
      value = (tme_uint32_t) (part >> bits_skip);
    }
    else {
      value = (part << bits_skip) >> (64 - 32);
    }
    bits_have = 64 - bits_skip;

    /* the value may continue into the following part: */
    if (__tme_predict_false(bits_have < 32)) {

      /* a read-before-read barrier between the two part reads: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge in the missing bits,
         above the gathered bits on a little-endian host and
         below them on a big-endian host: */
      part_ptr++;
      part = tme_memory_atomic_read64(part_ptr, rwlock, sizeof(tme_uint64_t));
      if (host_is_little) {
        value |= ((tme_uint32_t) part) << bits_have;
      }
      else {
        value |= part >> ((64 - 32) + bits_have);
      }
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  /* a 32-bit host boundary: */
  if (host_boundary == sizeof(tme_uint32_t)) {
    _tme_const tme_shared tme_uint32_t *part_ptr;
    tme_uint32_t part;

    /* round the address down to a 32-bit part, and note how far
       into that part the value starts: */
    part_ptr = (_tme_const tme_shared tme_uint32_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (32 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (32 / 8)) * 8;

    /* fetch the first part and position its bits.  the part is as
       wide as the value, so a big-endian host only has to shift
       the skipped bits off the top: */
    part = tme_memory_atomic_read32(part_ptr, rwlock, sizeof(tme_uint32_t));
    if (host_is_little) {
      value = (tme_uint32_t) (part >> bits_skip);
    }
    else {
      value = ((tme_uint32_t) part) << bits_skip;
    }
    bits_have = 32 - bits_skip;

    /* the value may continue into the following part: */
    if (__tme_predict_false(bits_have < 32)) {

      /* a read-before-read barrier between the two part reads: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge in the missing bits: */
      part_ptr++;
      part = tme_memory_atomic_read32(part_ptr, rwlock, sizeof(tme_uint32_t));
      if (host_is_little) {
        value |= ((tme_uint32_t) part) << bits_have;
      }
      else {
        value |= ((tme_uint32_t) part) >> bits_have;
      }
    }
  }

  /* a 16-bit host boundary: */
  else if (host_boundary == sizeof(tme_uint16_t)) {
    _tme_const tme_shared tme_uint16_t *part_ptr;
    tme_uint16_t part;

    /* round the address down to a 16-bit part, and note how far
       into that part the value starts: */
    part_ptr = (_tme_const tme_shared tme_uint16_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (16 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (16 / 8)) * 8;

    /* fetch the first part.  a big-endian host places it at the
       most-significant end of the value: */
    part = tme_memory_atomic_read16(part_ptr, rwlock, sizeof(tme_uint16_t));
    if (host_is_little) {
      value = (tme_uint32_t) (part >> bits_skip);
    }
    else {
      value = ((tme_uint32_t) part) << ((32 - 16) + bits_skip);
    }
    bits_have = 16 - bits_skip;

    /* keep fetching 16-bit parts until all 32 bits are gathered: */
    while (bits_have < 32) {

      /* a read-before-read barrier ahead of each further read: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge it in: */
      part_ptr++;
      part = tme_memory_atomic_read16(part_ptr, rwlock, sizeof(tme_uint16_t));
      if (host_is_little) {
        value |= ((tme_uint32_t) part) << bits_have;
      }
      else {
        value |= (((tme_uint32_t) part) << (32 - 16)) >> bits_have;
      }

      bits_have += 16;
    }
  }

  /* otherwise, the value is read one 8-bit part at a time: */
  else {
    _tme_const tme_shared tme_uint8_t *part_ptr;
    tme_uint8_t part;

    /* an 8-bit part is always aligned, so nothing is skipped: */
    part_ptr = (_tme_const tme_shared tme_uint8_t *) (((tme_uintptr_t) mem) & ~(((tme_uintptr_t) (8 / 8)) - 1));
    bits_skip = (((unsigned int) (tme_uintptr_t) mem) % (8 / 8)) * 8;

    /* fetch the first part.  a big-endian host places it in the
       most-significant byte of the value: */
    part = tme_memory_atomic_read8(part_ptr, rwlock, sizeof(tme_uint8_t));
    if (host_is_little) {
      value = (tme_uint32_t) (part >> bits_skip);
    }
    else {
      value = ((tme_uint32_t) part) << ((32 - 8) + bits_skip);
    }
    bits_have = 8 - bits_skip;

    /* keep fetching 8-bit parts until all 32 bits are gathered: */
    while (bits_have < 32) {

      /* a read-before-read barrier ahead of each further read: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* fetch the following part and merge it in: */
      part_ptr++;
      part = tme_memory_atomic_read8(part_ptr, rwlock, sizeof(tme_uint8_t));
      if (host_is_little) {
        value |= ((tme_uint32_t) part) << bits_have;
      }
      else {
        value |= (((tme_uint32_t) part) << (32 - 8)) >> bits_have;
      }

      bits_have += 8;
    }
  }

  /* hand back the assembled value: */
  return (value);
}

/* undefine the macro version of tme_memory_bus_write32: */
#undef tme_memory_bus_write32

/* the bus 32-bit write slow function: */
/* this stores the 32-bit value x into the memory at mem, one
   host-boundary-sized part at a time.  mem may be misaligned with
   respect to the host boundary (TME_MEMORY_BUS_BOUNDARY): the first
   part updated is the aligned part that contains the first byte,
   and size_skip is the number of bits in that part that precede the
   data.  parts that are only partially covered by the data are
   updated with a read and compare-and-exchange loop, so that the
   bits outside the data are carried over from the value read; parts
   that are fully covered are written directly.  a write-before-write
   barrier is made before each part after the first.

   rwlock is handed to every atomic operation.  align_min is not
   referenced in this function, and bus_boundary is only checked by
   the assertion.

   x is only 32 bits wide, so it must never be shifted by 32 or
   more (that is undefined behavior in C).  in the 64-bit and 32-bit
   cases the bits already written are therefore only shifted off
   once we know that some data remains, which bounds the shift
   count by size_done < 32.

   NOTE(review): in the little-endian "remaining part" cases, the
   clearing mask is built with "<< size_done".  assuming that
   _tme_memory_type_mask(type, op) is all-bits-one of type with op
   applied, that clears bits size_done and up, while the remaining
   data has been shifted down to bit zero of x.  the big-endian
   masks do line up with the data.  TODO confirm against the
   definition of _tme_memory_type_mask in <tme/memory.h> before
   changing anything: */
void
tme_memory_bus_write32(tme_shared tme_uint32_t *mem, tme_uint32_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;
  tme_uint8_t part8_cmp;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to write the first 64-bit part of the memory: */
    parts64 = (tme_shared tme_uint64_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (64 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (64 / 8)) * 8;

    /* write the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    do {
      part64_cmp = part64;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part64 |= (((tme_uint64_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + 0)) >> size_skip);
        part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> size_skip);
      }

      /* loop until we can atomically update this part: */
      part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
    } while (part64 != part64_cmp);
    size_done = 64 - size_skip;

    /* write at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* shift off the data that we have already written.  we only
         do this here, where size_done is known to be less than 32,
         because shifting the 32-bit x by 32 or more is undefined: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= size_done;
      }
      else {
        x <<= size_done;
      }

      /* make a barrier: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;

        /* on a little-endian host, the remaining data
           goes at the bottom of this part: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part64 |= (((tme_uint64_t) x) << 0);
        }

        /* on a big-endian host, the remaining data
           goes at the top of this part: */
        else {
          part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + size_done)) >> 0);
          part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> 0);
        }

        /* loop until we can atomically update this part: */
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to write the first 32-bit part of the memory: */
    parts32 = (tme_shared tme_uint32_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (32 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (32 / 8)) * 8;

    /* write the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    do {
      part32_cmp = part32;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part32 |= (((tme_uint32_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
        part32 |= (x >> ((32 - 32) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
    } while (part32 != part32_cmp);
    size_done = 32 - size_skip;

    /* write at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* shift off the data that we have already written.  we only
         do this here, where size_done is known to be less than 32,
         because an aligned write has size_done == 32, and shifting
         the 32-bit x by 32 is undefined: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= size_done;
      }
      else {
        x <<= size_done;
      }

      /* make a barrier: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;

        /* on a little-endian host, the remaining data
           goes at the bottom of this part: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part32 |= (((tme_uint32_t) x) << 0);
        }

        /* on a big-endian host, the remaining data
           goes at the top of this part: */
        else {
          part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
          part32 |= (x >> ((32 - 32) + 0));
        }

        /* loop until we can atomically update this part: */
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to write the first 16-bit part of the memory: */
    parts16 = (tme_shared tme_uint16_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (16 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (16 / 8)) * 8;

    /* write the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    do {
      part16_cmp = part16;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part16 |= (((tme_uint16_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
        part16 |= (x >> ((32 - 16) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
    } while (part16 != part16_cmp);

    /* shift off the data that we have already written.  the
       shift count here is either 8 or 16, so it is always
       less than the width of x: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (16 - size_skip);
    }
    else {
      x <<= (16 - size_skip);
    }
    size_done = 16 - size_skip;

    /* try to write one full 16-bit part of memory: */
    if (__tme_predict_true(size_done <= (32 - 16))) {

      /* make a barrier: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 16-bit part of memory.  on a big-endian
         host the next data is at the top of x, on a
         little-endian host it is at the bottom: */
      part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 16)));
      parts16++;
      tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
      size_done += 16;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 16;
      }
      else {
        x <<= 16;
      }
    }

    /* write at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a barrier: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;

        /* on a little-endian host, the remaining data
           goes at the bottom of this part: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part16 |= (((tme_uint16_t) x) << 0);
        }

        /* on a big-endian host, the remaining data
           goes at the top of this part: */
        else {
          part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
          part16 |= (x >> ((32 - 16) + 0));
        }

        /* loop until we can atomically update this part: */
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
    }
  }

  else {

    /* prepare to write the first 8-bit part of the memory: */
    parts8 = (tme_shared tme_uint8_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (8 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (8 / 8)) * 8;

    /* write the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    do {
      part8_cmp = part8;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part8 |= (((tme_uint8_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
        part8 |= (x >> ((32 - 8) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
    } while (part8 != part8_cmp);

    /* shift off the data that we have already written.  size_skip
       is always zero for 8-bit parts, so the shift count is 8: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (8 - size_skip);
    }
    else {
      x <<= (8 - size_skip);
    }
    size_done = 8 - size_skip;

    /* write as many full 8-bit parts of the memory as we can: */
    for (; size_done <= (32 - 8); ) {

      /* make a barrier: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 8-bit part of memory.  on a big-endian
         host the next data is at the top of x, on a
         little-endian host it is at the bottom: */
      part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 8)));
      parts8++;
      tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
      size_done += 8;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 8;
      }
      else {
        x <<= 8;
      }
    }

    /* write at most one remaining 8-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a barrier: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;

        /* on a little-endian host, the remaining data
           goes at the bottom of this part: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part8 |= (((tme_uint8_t) x) << 0);
        }

        /* on a big-endian host, the remaining data
           goes at the top of this part: */
        else {
          part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
          part8 |= (x >> ((32 - 8) + 0));
        }

        /* loop until we can atomically update this part: */
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
    }
  }
}

#ifdef TME_HAVE_INT64_T

/* undefine the macro version of tme_memory_bus_read64: */
#undef tme_memory_bus_read64

/* the bus 64-bit read slow function: */
/* this gathers a 64-bit value from the memory at mem, one
   host-boundary-sized part at a time.  mem may be misaligned with
   respect to the host boundary (TME_MEMORY_BUS_BOUNDARY): the first
   part read is the aligned part that contains the first byte, and
   further parts are read, each after a read-before-read barrier,
   until 64 bits have been gathered.

   rwlock is handed to every atomic read.  align_min is not
   referenced in this function, and bus_boundary is only checked by
   the assertion.

   this function is only compiled when TME_HAVE_INT64_T is defined,
   so the 64-bit case below needs no guard of its own: */
tme_uint64_t
tme_memory_bus_read64(_tme_const tme_shared tme_uint64_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  const tme_uintptr_t address = (tme_uintptr_t) mem;
  _tme_const tme_shared tme_uint64_t *word64;
  _tme_const tme_shared tme_uint32_t *word32;
  _tme_const tme_shared tme_uint16_t *word16;
  _tme_const tme_shared tme_uint8_t *word8;
  tme_uint64_t data64;
  tme_uint32_t data32;
  tme_uint16_t data16;
  tme_uint8_t data8;
  unsigned int bits_skipped;
  unsigned int bits_read;
  tme_uint64_t value;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

  switch (host_boundary) {

    /* the host boundary is 64 bits: */
  case sizeof(tme_uint64_t):

    /* find the aligned 64-bit part that holds the first byte, and
       the number of bits in that part that come before it: */
    word64 = (_tme_const tme_shared tme_uint64_t *) (address & ~((tme_uintptr_t) ((64 / 8) - 1)));
    bits_skipped = (((unsigned int) address) % (64 / 8)) * 8;

    /* read the first part.  on a little-endian host the bits to
       skip are at the bottom of the part, and on a big-endian
       host they are at the top: */
    data64 = tme_memory_atomic_read64(word64, rwlock, sizeof(tme_uint64_t));
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      value = data64 >> bits_skipped;
    }
    else {
      value = data64 << bits_skipped;
    }
    bits_read = 64 - bits_skipped;

    /* a misaligned read needs exactly one more part: */
    if (__tme_predict_false(bits_read < 64)) {
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
      word64 += 1;
      data64 = tme_memory_atomic_read64(word64, rwlock, sizeof(tme_uint64_t));

      /* place the new data after the data already gathered: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        value |= data64 << bits_read;
      }
      else {
        value |= data64 >> bits_read;
      }
    }
    break;

    /* the host boundary is 32 bits: */
  case sizeof(tme_uint32_t):

    /* find the aligned 32-bit part that holds the first byte, and
       the number of bits in that part that come before it: */
    word32 = (_tme_const tme_shared tme_uint32_t *) (address & ~((tme_uintptr_t) ((32 / 8) - 1)));
    bits_skipped = (((unsigned int) address) % (32 / 8)) * 8;

    /* read the first part, dropping the bits to skip: */
    data32 = tme_memory_atomic_read32(word32, rwlock, sizeof(tme_uint32_t));
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      value = (tme_uint64_t) (data32 >> bits_skipped);
    }
    else {
      value = ((tme_uint64_t) data32) << ((64 - 32) + bits_skipped);
    }
    bits_read = 32 - bits_skipped;

    /* keep reading 32-bit parts until we have all 64 bits.  any
       bits of the last part that are beyond the 64 bits wanted
       are shifted out of the value: */
    while (bits_read < 64) {
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
      word32 += 1;
      data32 = tme_memory_atomic_read32(word32, rwlock, sizeof(tme_uint32_t));
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        value |= ((tme_uint64_t) data32) << bits_read;
      }
      else {
        value |= (((tme_uint64_t) data32) << (64 - 32)) >> bits_read;
      }
      bits_read += 32;
    }
    break;

    /* the host boundary is 16 bits: */
  case sizeof(tme_uint16_t):

    /* find the aligned 16-bit part that holds the first byte, and
       the number of bits in that part that come before it: */
    word16 = (_tme_const tme_shared tme_uint16_t *) (address & ~((tme_uintptr_t) ((16 / 8) - 1)));
    bits_skipped = (((unsigned int) address) % (16 / 8)) * 8;

    /* read the first part, dropping the bits to skip: */
    data16 = tme_memory_atomic_read16(word16, rwlock, sizeof(tme_uint16_t));
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      value = (tme_uint64_t) (data16 >> bits_skipped);
    }
    else {
      value = ((tme_uint64_t) data16) << ((64 - 16) + bits_skipped);
    }
    bits_read = 16 - bits_skipped;

    /* keep reading 16-bit parts until we have all 64 bits: */
    while (bits_read < 64) {
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
      word16 += 1;
      data16 = tme_memory_atomic_read16(word16, rwlock, sizeof(tme_uint16_t));
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        value |= ((tme_uint64_t) data16) << bits_read;
      }
      else {
        value |= (((tme_uint64_t) data16) << (64 - 16)) >> bits_read;
      }
      bits_read += 16;
    }
    break;

    /* otherwise, we read 8 bits at a time: */
  default:

    /* an 8-bit part is always aligned, so there
       is never anything to skip: */
    word8 = (_tme_const tme_shared tme_uint8_t *) address;

    /* read the first part: */
    data8 = tme_memory_atomic_read8(word8, rwlock, sizeof(tme_uint8_t));
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      value = (tme_uint64_t) data8;
    }
    else {
      value = ((tme_uint64_t) data8) << (64 - 8);
    }
    bits_read = 8;

    /* keep reading 8-bit parts until we have all 64 bits: */
    while (bits_read < 64) {
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
      word8 += 1;
      data8 = tme_memory_atomic_read8(word8, rwlock, sizeof(tme_uint8_t));
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        value |= ((tme_uint64_t) data8) << bits_read;
      }
      else {
        value |= (((tme_uint64_t) data8) << (64 - 8)) >> bits_read;
      }
      bits_read += 8;
    }
    break;
  }

  /* return the value read: */
  return (value);
}

/* undefine the macro version of tme_memory_bus_write64: */
#undef tme_memory_bus_write64

/* the bus 64-bit write slow function: */
void
tme_memory_bus_write64(tme_shared tme_uint64_t *mem, tme_uint64_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;
  tme_uint8_t part8_cmp;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);


#ifdef TME_HAVE_INT64_T

 
 if (host_boundary == sizeof(tme_uint64_t))
 {

    /* prepare to write the first 64-bit part of the memory: */
    parts64 = (tme_shared tme_uint64_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (64 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (64 / 8)) * 8;
    size_done = 0;

    /* write the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    do {
      part64_cmp = part64;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
        part64 |= (((tme_uint64_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << 0) >> size_skip);
        part64 |= (x >> ((64 - 64) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
    } while (part64 != part64_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (64 - size_skip);
    }
    else {
      x <<= (64 - size_skip);
    }
    size_done = 64 - size_skip;

    /* write at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 64)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
          part64 |= (((tme_uint64_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << size_done) >> 0);
          part64 |= (x >> ((64 - 64) + 0));
        }

        /* loop until we can atomically update this part: */
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t))
 {

    /* prepare to write the first 32-bit part of the memory: */
    parts32 = (tme_shared tme_uint32_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (32 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (32 / 8)) * 8;
    size_done = 0;

    /* write the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    do {
      part32_cmp = part32;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
        part32 |= (((tme_uint32_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
        part32 |= (x >> ((64 - 32) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
    } while (part32 != part32_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (32 - size_skip);
    }
    else {
      x <<= (32 - size_skip);
    }
    size_done = 32 - size_skip;

    /* try to write one full 32-bit part of memory: */
    if (__tme_predict_true(size_done <= (64 - 32))) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 32-bit part of memory: */
      part32 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 32)));
      parts32++;
      tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
      size_done += 32;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 32;
      }
      else {
        x <<= 32;
      }
    }

    /* write at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 64)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
          part32 |= (((tme_uint32_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
          part32 |= (x >> ((64 - 32) + 0));
        }

        /* loop until we can atomically update this part: */
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= (32 - 0);
      }
      else {
        x <<= (32 - 0);
      }
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t))
 {

    /* prepare to write the first 16-bit part of the memory: */
    parts16 = (tme_shared tme_uint16_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (16 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (16 / 8)) * 8;
    size_done = 0;

    /* write the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    do {
      part16_cmp = part16;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
        part16 |= (((tme_uint16_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
        part16 |= (x >> ((64 - 16) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
    } while (part16 != part16_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (16 - size_skip);
    }
    else {
      x <<= (16 - size_skip);
    }
    size_done = 16 - size_skip;

    /* write as many full 16-bit parts of the memory as we can: */
    for (; size_done <= (64 - 16); ) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 16-bit part of memory: */
      part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 16)));
      parts16++;
      tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
      size_done += 16;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 16;
      }
      else {
        x <<= 16;
      }
    }

    /* write at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 64)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
          part16 |= (((tme_uint16_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
          part16 |= (x >> ((64 - 16) + 0));
        }

        /* loop until we can atomically update this part: */
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= (16 - 0);
      }
      else {
        x <<= (16 - 0);
      }
    }
  }

  else {

    /* prepare to write the first 8-bit part of the memory: */
    parts8 = (tme_shared tme_uint8_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (8 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (8 / 8)) * 8;
    size_done = 0;

    /* write the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    do {
      part8_cmp = part8;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
        part8 |= (((tme_uint8_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
        part8 |= (x >> ((64 - 8) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
    } while (part8 != part8_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (8 - size_skip);
    }
    else {
      x <<= (8 - size_skip);
    }
    size_done = 8 - size_skip;

    /* write as many full 8-bit parts of the memory as we can: */
    for (; size_done <= (64 - 8); ) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 8-bit part of memory: */
      part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 8)));
      parts8++;
      tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
      size_done += 8;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 8;
      }
      else {
        x <<= 8;
      }
    }

    /* write at most one remaining 8-bit part of the memory: */
    if (__tme_predict_false(size_done < 64)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
          part8 |= (((tme_uint8_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
          part8 |= (x >> ((64 - 8) + 0));
        }

        /* loop until we can atomically update this part: */
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= (8 - 0);
      }
      else {
        x <<= (8 - 0);
      }
    }
  }
}

#endif /* TME_HAVE_INT64_T */

#ifdef TME_HAVE_INT128_T

/* undefine the macro version of tme_memory_bus_read128: */
#undef tme_memory_bus_read128

/* the bus 128-bit read slow function.

   this builds a 128-bit value out of the aligned parts of memory
   that cover it.  each part is as wide as the host bus boundary and
   is read with one atomic read, with a read-before-read barrier made
   before every part after the first.  bits of the first part that
   come before mem are shifted off, and bits of the last part that
   come after the value are shifted out of the 128-bit result.

   mem is the (possibly misaligned) address of the value and rwlock
   is handed to every atomic read.  align_min is not used by this
   function.  bus_boundary must be nonzero and no greater than the
   host's TME_MEMORY_BUS_BOUNDARY: */
tme_uint128_t
tme_memory_bus_read128(_tme_const tme_shared tme_uint128_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  const tme_uintptr_t address = (tme_uintptr_t) mem;
  const int host_little = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE);
  unsigned int bits_skip;
  unsigned int bits_done;
  tme_uint128_t value;
#ifdef TME_HAVE_INT64_T
  _tme_const tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
#endif /* TME_HAVE_INT64_T */
  _tme_const tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  _tme_const tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  _tme_const tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* find the aligned 64-bit part that holds the first byte, and
       count the bits of that part that come before mem: */
    parts64 = (_tme_const tme_shared tme_uint64_t *) (address & ~((tme_uintptr_t) 7));
    bits_skip = ((unsigned int) (address & 7)) * 8;

    /* read the first part, drop its skipped bits, and place the
       rest at the start of the result - the low end on a
       little-endian host, the high end on a big-endian host: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    value = (host_little
             ? ((tme_uint128_t) (part64 >> bits_skip))
             : (((tme_uint128_t) part64) << ((128 - 64) + bits_skip)));
    bits_done = 64 - bits_skip;

    /* each later part goes bits_done bits further into the result: */
    while (bits_done < 128) {

      /* make a read-before-read barrier ahead of the next part: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next part and merge it into the result: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      value |= (host_little
                ? (((tme_uint128_t) part64) << bits_done)
                : ((((tme_uint128_t) part64) << (128 - 64)) >> bits_done));
      bits_done += 64;
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* find the aligned 32-bit part that holds the first byte, and
       count the bits of that part that come before mem: */
    parts32 = (_tme_const tme_shared tme_uint32_t *) (address & ~((tme_uintptr_t) 3));
    bits_skip = ((unsigned int) (address & 3)) * 8;

    /* read the first part, drop its skipped bits, and place the
       rest at the start of the result: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    value = (host_little
             ? ((tme_uint128_t) (part32 >> bits_skip))
             : (((tme_uint128_t) part32) << ((128 - 32) + bits_skip)));
    bits_done = 32 - bits_skip;

    /* each later part goes bits_done bits further into the result: */
    while (bits_done < 128) {

      /* make a read-before-read barrier ahead of the next part: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next part and merge it into the result: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      value |= (host_little
                ? (((tme_uint128_t) part32) << bits_done)
                : ((((tme_uint128_t) part32) << (128 - 32)) >> bits_done));
      bits_done += 32;
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* find the aligned 16-bit part that holds the first byte, and
       count the bits of that part that come before mem: */
    parts16 = (_tme_const tme_shared tme_uint16_t *) (address & ~((tme_uintptr_t) 1));
    bits_skip = ((unsigned int) (address & 1)) * 8;

    /* read the first part, drop its skipped bits, and place the
       rest at the start of the result: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    value = (host_little
             ? ((tme_uint128_t) (part16 >> bits_skip))
             : (((tme_uint128_t) part16) << ((128 - 16) + bits_skip)));
    bits_done = 16 - bits_skip;

    /* each later part goes bits_done bits further into the result: */
    while (bits_done < 128) {

      /* make a read-before-read barrier ahead of the next part: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next part and merge it into the result: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      value |= (host_little
                ? (((tme_uint128_t) part16) << bits_done)
                : ((((tme_uint128_t) part16) << (128 - 16)) >> bits_done));
      bits_done += 16;
    }
  }

  else {

    /* an 8-bit part always starts exactly at mem, so there is
       never anything to skip: */
    parts8 = (_tme_const tme_shared tme_uint8_t *) address;
    bits_skip = 0;

    /* read the first part and place it at the start of the result: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    value = (host_little
             ? ((tme_uint128_t) (part8 >> bits_skip))
             : (((tme_uint128_t) part8) << ((128 - 8) + bits_skip)));
    bits_done = 8 - bits_skip;

    /* each later part goes bits_done bits further into the result: */
    while (bits_done < 128) {

      /* make a read-before-read barrier ahead of the next part: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next part and merge it into the result: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      value |= (host_little
                ? (((tme_uint128_t) part8) << bits_done)
                : ((((tme_uint128_t) part8) << (128 - 8)) >> bits_done));
      bits_done += 8;
    }
  }

  /* return the value read: */
  return (value);
}

/* undefine the macro version of tme_memory_bus_write128: */
#undef tme_memory_bus_write128

/* the bus 128-bit write slow function.

   this stores the 128-bit value x at mem by updating, in increasing
   address order, the aligned parts of memory that cover it.  each
   part is as wide as the host bus boundary, and a write-before-write
   barrier is made before every part after the first.

   the first part is always updated with a tme_memory_atomic_cx loop,
   which keeps the size_skip bits of that part that come before mem.
   parts that are completely covered by the value are then written
   with plain atomic writes.  when mem is misaligned, one last part
   is only partly covered, and it is updated with another
   tme_memory_atomic_cx loop that must keep the bits of that part
   that come after the end of the value and replace the rest.

   rwlock is handed to every atomic operation.  align_min is not used
   by this function.  bus_boundary must be nonzero and no greater
   than the host's TME_MEMORY_BUS_BOUNDARY: */
void
tme_memory_bus_write128(tme_shared tme_uint128_t *mem, tme_uint128_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;
  tme_uint8_t part8_cmp;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to write the first 64-bit part of the memory.
       size_skip is the number of bits in that part that come
       before mem: */
    parts64 = (tme_shared tme_uint64_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (64 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (64 / 8)) * 8;

    /* write the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    do {
      part64_cmp = part64;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint128_t, << 0)) << size_skip));
        part64 |= (((tme_uint64_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << 0) >> size_skip);
        part64 |= (x >> ((128 - 64) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
    } while (part64 != part64_cmp);

    /* discard the data just written, so that the next data to
       write is at the low end of x on a little-endian host and
       at the high end of x on a big-endian host: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (64 - size_skip);
    }
    else {
      x <<= (64 - size_skip);
    }
    size_done = 64 - size_skip;

    /* try to write one full 64-bit part of memory: */
    if (__tme_predict_true(size_done <= (128 - 64))) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 64-bit part of memory: */
      part64 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (128 - 64)));
      parts64++;
      tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
      size_done += 64;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 64;
      }
      else {
        x <<= 64;
      }
    }

    /* write at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 128)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 64-bit part of the memory.  only the first
         (128 - size_done) bits of this part belong to the value: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;

        /* on a little-endian host, the remaining data goes in
           the low bits of the part, so we clear exactly those
           bits and then insert the data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) << (128 - size_done));
          part64 |= ((tme_uint64_t) x);
        }

        /* on a big-endian host, the remaining data goes in
           the high bits of the part, so we clear exactly those
           bits and then insert the data: */
        else {
          part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) >> (128 - size_done));
          part64 |= (x >> (128 - 64));
        }

        /* loop until we can atomically update this part: */
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to write the first 32-bit part of the memory.
       size_skip is the number of bits in that part that come
       before mem: */
    parts32 = (tme_shared tme_uint32_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (32 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (32 / 8)) * 8;

    /* write the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    do {
      part32_cmp = part32;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint128_t, << 0)) << size_skip));
        part32 |= (((tme_uint32_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
        part32 |= (x >> ((128 - 32) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
    } while (part32 != part32_cmp);

    /* discard the data just written: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (32 - size_skip);
    }
    else {
      x <<= (32 - size_skip);
    }
    size_done = 32 - size_skip;

    /* write as many full 32-bit parts of the memory as we can: */
    for (; size_done <= (128 - 32); ) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 32-bit part of memory: */
      part32 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (128 - 32)));
      parts32++;
      tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
      size_done += 32;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 32;
      }
      else {
        x <<= 32;
      }
    }

    /* write at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 128)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 32-bit part of the memory.  only the first
         (128 - size_done) bits of this part belong to the value: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;

        /* on a little-endian host, the remaining data goes in
           the low bits of the part, so we clear exactly those
           bits and then insert the data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) << (128 - size_done));
          part32 |= ((tme_uint32_t) x);
        }

        /* on a big-endian host, the remaining data goes in
           the high bits of the part, so we clear exactly those
           bits and then insert the data: */
        else {
          part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) >> (128 - size_done));
          part32 |= (x >> (128 - 32));
        }

        /* loop until we can atomically update this part: */
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to write the first 16-bit part of the memory.
       size_skip is the number of bits in that part that come
       before mem: */
    parts16 = (tme_shared tme_uint16_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (16 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (16 / 8)) * 8;

    /* write the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    do {
      part16_cmp = part16;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint128_t, << 0)) << size_skip));
        part16 |= (((tme_uint16_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
        part16 |= (x >> ((128 - 16) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
    } while (part16 != part16_cmp);

    /* discard the data just written: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (16 - size_skip);
    }
    else {
      x <<= (16 - size_skip);
    }
    size_done = 16 - size_skip;

    /* write as many full 16-bit parts of the memory as we can: */
    for (; size_done <= (128 - 16); ) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 16-bit part of memory: */
      part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (128 - 16)));
      parts16++;
      tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
      size_done += 16;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 16;
      }
      else {
        x <<= 16;
      }
    }

    /* write at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 128)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 16-bit part of the memory.  only the first
         (128 - size_done) bits of this part belong to the value: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;

        /* on a little-endian host, the remaining data goes in
           the low bits of the part, so we clear exactly those
           bits and then insert the data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) << (128 - size_done));
          part16 |= ((tme_uint16_t) x);
        }

        /* on a big-endian host, the remaining data goes in
           the high bits of the part, so we clear exactly those
           bits and then insert the data: */
        else {
          part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) >> (128 - size_done));
          part16 |= (x >> (128 - 16));
        }

        /* loop until we can atomically update this part: */
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
    }
  }

  else {

    /* prepare to write the first 8-bit part of the memory.
       an 8-bit part always starts at mem, so size_skip is
       always zero here: */
    parts8 = (tme_shared tme_uint8_t *) (((tme_uintptr_t) mem) & (((tme_uintptr_t) 0) - (8 / 8)));
    size_skip = (((unsigned int) (tme_uintptr_t) mem) % (8 / 8)) * 8;

    /* write the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    do {
      part8_cmp = part8;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint128_t, << 0)) << size_skip));
        part8 |= (((tme_uint8_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
        part8 |= (x >> ((128 - 8) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
    } while (part8 != part8_cmp);

    /* discard the data just written: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (8 - size_skip);
    }
    else {
      x <<= (8 - size_skip);
    }
    size_done = 8 - size_skip;

    /* write as many full 8-bit parts of the memory as we can: */
    for (; size_done <= (128 - 8); ) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 8-bit part of memory: */
      part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (128 - 8)));
      parts8++;
      tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
      size_done += 8;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 8;
      }
      else {
        x <<= 8;
      }
    }

    /* write at most one remaining 8-bit part of the memory.  since
       size_skip is always zero for 8-bit parts, the loop above
       always finishes the value and this is not expected to run: */
    if (__tme_predict_false(size_done < 128)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (128 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;

        /* on a little-endian host, the remaining data goes in
           the low bits of the part, so we clear exactly those
           bits and then insert the data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) << (128 - size_done));
          part8 |= ((tme_uint8_t) x);
        }

        /* on a big-endian host, the remaining data goes in
           the high bits of the part, so we clear exactly those
           bits and then insert the data: */
        else {
          part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) >> (128 - size_done));
          part8 |= (x >> (128 - 8));
        }

        /* loop until we can atomically update this part: */
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
    }
  }
}

#endif /* TME_HAVE_INT128_T */

/* undefine the macro version of tme_memory_bus_read_buffer: */
#undef tme_memory_bus_read_buffer

/* the bus read buffer function.

   this copies count bytes from the shared memory at mem into the
   private buffer at buffer, using one of three strategies:

   - if TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) is zero,
     the rwlock is read-locked around a plain memcpy.

   - otherwise, if bus_boundary is greater than the host's
     TME_MEMORY_BUS_BOUNDARY, all other threads are suspended around
     a plain memcpy.

   - otherwise, memory is read in aligned parts of the host bus
     boundary size with tme_memory_atomic_readN(), and the bytes of
     those parts are copied (shifting them when the buffer is not
     aligned to the part size) into the buffer.  the first and last
     parts may be read in full even when only some of their bytes are
     wanted.

   count and bus_boundary must be nonzero (both are asserted).
   align_min is only referenced inside a constant-false expression,
   presumably to keep it from being reported as unused: */
void
tme_memory_bus_read_buffer(_tme_const tme_shared tme_uint8_t *mem, tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  _tme_const tme_uint8_t *part_buffer;
  unsigned int count_done;
  unsigned int count_misaligned;
  unsigned int bits_misaligned;
#ifdef TME_HAVE_INT64_T
  _tme_const tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64_buffer;
  tme_uint64_t part64;
  tme_uint64_t part64_next;
#endif /* TME_HAVE_INT64_T */
  _tme_const tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32_buffer;
  tme_uint32_t part32;
  tme_uint32_t part32_next;
  _tme_const tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16_buffer;
  tme_uint16_t part16;
  tme_uint16_t part16_next;
  _tme_const tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;

  assert (count != 0);
  assert (bus_boundary != 0);

  /* if we are locking for all memory accesses, lock memory
     around a memcpy: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    tme_rwlock_rdlock(rwlock);
    memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count));
    tme_rwlock_rdunlock(rwlock);
  }

  /* otherwise, if the emulated bus boundary is greater than the
     host's bus boundary, we are forced to stop all other threads
     around a memcpy: */
  else if (__tme_predict_false(bus_boundary == 0
                               || bus_boundary > host_boundary)) {
    tme_thread_suspend_others();
    memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count) + (0 && align_min));
    tme_thread_resume_others();
  }

#ifdef TME_HAVE_INT64_T

  else if (host_boundary == sizeof(tme_uint64_t)) {

    /* make a 64-bit pointer to the memory: */
    parts64 = (_tme_const tme_shared tme_uint64_t *) mem;

    /* if this pointer is not 64-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts64) % sizeof(tme_uint64_t)) != 0)) {

      /* get the misalignment from the previous 64-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts64) % sizeof(tme_uint64_t);

      /* truncate this pointer to the previous 64-bit boundary: */
      parts64 = (_tme_const tme_shared tme_uint64_t *) (((tme_uintptr_t) parts64) & (((tme_uintptr_t) 0) - sizeof(tme_uint64_t)));

      /* get the number of bytes to read in the first 64-bit memory part.
         since count is nonzero, this is at least one: */
      count_done = sizeof(tme_uint64_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* read the first 64-bit memory part: */
      part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      parts64++;

      /* copy to the buffer the bytes to read in the first
         64-bit memory part: */
      part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
      count -= count_done;
      do {
        *buffer = *part_buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);
    }

    /* if we have full 64-bit parts to read: */
    if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {

      /* if the buffer is 64-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint64_t)) == 0)) {

        /* read full 64-bit parts without shifting: */
        do {
          part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
          tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));

          /* advance: */
          parts64++;
          buffer += sizeof(tme_uint64_t);
          count -= sizeof(tme_uint64_t);
        } while (count >= sizeof(tme_uint64_t));
      }

      /* otherwise, the buffer is not 64-bit aligned: */
      else {

        /* get the misalignment to the next 64-bit boundary.  this
           is nonzero, because the buffer is not aligned: */
        count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint64_t);

        /* read the next 64-bit memory part: */
        part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
        parts64++;

        /* copy to the buffer until it is aligned: */
        part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *buffer = *part_buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* read full 64-bit words with shifting.  part64 always
           holds, at its lowest addresses, the bytes of the most
           recently read memory part that have not been copied yet: */
        bits_misaligned = count_misaligned * 8;
        part64
          = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (part64_buffer >> bits_misaligned)
             : (part64_buffer << bits_misaligned));
        for (; count >= sizeof(tme_uint64_t); ) {
          part64_next = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part64 |= (part64_next << (64 - bits_misaligned));
            tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
            part64 = (part64_next >> bits_misaligned);
          }
          else {
            part64 |= (part64_next >> (64 - bits_misaligned));
            tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
            part64 = (part64_next << bits_misaligned);
          }

          /* advance: */
          parts64++;
          buffer += sizeof(tme_uint64_t);
          count -= sizeof(tme_uint64_t);
        }

        /* calculate how many more bytes there are to read in this
           64-bit memory part: */
        count_done = sizeof(tme_uint64_t) - count_misaligned;
        part64_buffer = part64;

        /* copy to the buffer the remaining bytes in this 64-bit part.
           the shifting loop above may have consumed every byte that
           was left, in which case count_done becomes zero, so this
           loop must test before it copies: */
        if (count_done > count) {
          count_done = count;
        }
        part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
        count -= count_done;
        for (; count_done != 0; count_done--) {
          *buffer = *part_buffer;
          part_buffer++;
          buffer++;
        }
      }
    }

    /* if we still have bytes to read: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 64-bit part to read: */
      assert (count < sizeof(tme_uint64_t));

      /* read the last 64-bit memory part: */
      part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

      /* copy to the buffer the bytes to read in the last
         64-bit memory part: */
      part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
      count_done = count;
      do {
        *buffer = *part_buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);
    }

  }

#endif /* TME_HAVE_INT64_T */

  else if (host_boundary == sizeof(tme_uint32_t)) {

    /* make a 32-bit pointer to the memory: */
    parts32 = (_tme_const tme_shared tme_uint32_t *) mem;

    /* if this pointer is not 32-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts32) % sizeof(tme_uint32_t)) != 0)) {

      /* get the misalignment from the previous 32-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts32) % sizeof(tme_uint32_t);

      /* truncate this pointer to the previous 32-bit boundary: */
      parts32 = (_tme_const tme_shared tme_uint32_t *) (((tme_uintptr_t) parts32) & (((tme_uintptr_t) 0) - sizeof(tme_uint32_t)));

      /* get the number of bytes to read in the first 32-bit memory part.
         since count is nonzero, this is at least one: */
      count_done = sizeof(tme_uint32_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* read the first 32-bit memory part: */
      part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      parts32++;

      /* copy to the buffer the bytes to read in the first
         32-bit memory part: */
      part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
      count -= count_done;
      do {
        *buffer = *part_buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);
    }

    /* if we have full 32-bit parts to read: */
    if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {

      /* if the buffer is 32-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint32_t)) == 0)) {

        /* read full 32-bit parts without shifting: */
        do {
          part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
          tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));

          /* advance: */
          parts32++;
          buffer += sizeof(tme_uint32_t);
          count -= sizeof(tme_uint32_t);
        } while (count >= sizeof(tme_uint32_t));
      }

      /* otherwise, the buffer is not 32-bit aligned: */
      else {

        /* get the misalignment to the next 32-bit boundary.  this
           is nonzero, because the buffer is not aligned: */
        count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint32_t);

        /* read the next 32-bit memory part: */
        part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
        parts32++;

        /* copy to the buffer until it is aligned: */
        part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *buffer = *part_buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* read full 32-bit words with shifting.  part32 always
           holds, at its lowest addresses, the bytes of the most
           recently read memory part that have not been copied yet: */
        bits_misaligned = count_misaligned * 8;
        part32
          = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (part32_buffer >> bits_misaligned)
             : (part32_buffer << bits_misaligned));
        for (; count >= sizeof(tme_uint32_t); ) {
          part32_next = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part32 |= (part32_next << (32 - bits_misaligned));
            tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
            part32 = (part32_next >> bits_misaligned);
          }
          else {
            part32 |= (part32_next >> (32 - bits_misaligned));
            tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
            part32 = (part32_next << bits_misaligned);
          }

          /* advance: */
          parts32++;
          buffer += sizeof(tme_uint32_t);
          count -= sizeof(tme_uint32_t);
        }

        /* calculate how many more bytes there are to read in this
           32-bit memory part: */
        count_done = sizeof(tme_uint32_t) - count_misaligned;
        part32_buffer = part32;

        /* copy to the buffer the remaining bytes in this 32-bit part.
           the shifting loop above may have consumed every byte that
           was left, in which case count_done becomes zero, so this
           loop must test before it copies: */
        if (count_done > count) {
          count_done = count;
        }
        part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
        count -= count_done;
        for (; count_done != 0; count_done--) {
          *buffer = *part_buffer;
          part_buffer++;
          buffer++;
        }
      }
    }

    /* if we still have bytes to read: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 32-bit part to read: */
      assert (count < sizeof(tme_uint32_t));

      /* read the last 32-bit memory part: */
      part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

      /* copy to the buffer the bytes to read in the last
         32-bit memory part: */
      part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
      count_done = count;
      do {
        *buffer = *part_buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);
    }

  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* make a 16-bit pointer to the memory: */
    parts16 = (_tme_const tme_shared tme_uint16_t *) mem;

    /* if this pointer is not 16-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts16) % sizeof(tme_uint16_t)) != 0)) {

      /* get the misalignment from the previous 16-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts16) % sizeof(tme_uint16_t);

      /* truncate this pointer to the previous 16-bit boundary: */
      parts16 = (_tme_const tme_shared tme_uint16_t *) (((tme_uintptr_t) parts16) & (((tme_uintptr_t) 0) - sizeof(tme_uint16_t)));

      /* get the number of bytes to read in the first 16-bit memory part.
         since count is nonzero, this is at least one: */
      count_done = sizeof(tme_uint16_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* read the first 16-bit memory part: */
      part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      parts16++;

      /* copy to the buffer the bytes to read in the first
         16-bit memory part: */
      part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
      count -= count_done;
      do {
        *buffer = *part_buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);
    }

    /* if we have full 16-bit parts to read: */
    if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {

      /* if the buffer is 16-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint16_t)) == 0)) {

        /* read full 16-bit parts without shifting: */
        do {
          part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
          tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));

          /* advance: */
          parts16++;
          buffer += sizeof(tme_uint16_t);
          count -= sizeof(tme_uint16_t);
        } while (count >= sizeof(tme_uint16_t));
      }

      /* otherwise, the buffer is not 16-bit aligned: */
      else {

        /* get the misalignment to the next 16-bit boundary.  this
           is nonzero, because the buffer is not aligned: */
        count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint16_t);

        /* read the next 16-bit memory part: */
        part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
        parts16++;

        /* copy to the buffer until it is aligned: */
        part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *buffer = *part_buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* read full 16-bit words with shifting.  part16 always
           holds, at its lowest addresses, the bytes of the most
           recently read memory part that have not been copied yet: */
        bits_misaligned = count_misaligned * 8;
        part16
          = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (part16_buffer >> bits_misaligned)
             : (part16_buffer << bits_misaligned));
        for (; count >= sizeof(tme_uint16_t); ) {
          part16_next = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part16 |= (part16_next << (16 - bits_misaligned));
            tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
            part16 = (part16_next >> bits_misaligned);
          }
          else {
            part16 |= (part16_next >> (16 - bits_misaligned));
            tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
            part16 = (part16_next << bits_misaligned);
          }

          /* advance: */
          parts16++;
          buffer += sizeof(tme_uint16_t);
          count -= sizeof(tme_uint16_t);
        }

        /* calculate how many more bytes there are to read in this
           16-bit memory part: */
        count_done = sizeof(tme_uint16_t) - count_misaligned;
        part16_buffer = part16;

        /* copy to the buffer the remaining bytes in this 16-bit part.
           the shifting loop above may have consumed every byte that
           was left, in which case count_done becomes zero, so this
           loop must test before it copies: */
        if (count_done > count) {
          count_done = count;
        }
        part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
        count -= count_done;
        for (; count_done != 0; count_done--) {
          *buffer = *part_buffer;
          part_buffer++;
          buffer++;
        }
      }
    }

    /* if we still have bytes to read: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 16-bit part to read: */
      assert (count < sizeof(tme_uint16_t));

      /* read the last 16-bit memory part: */
      part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

      /* copy to the buffer the bytes to read in the last
         16-bit memory part: */
      part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
      count_done = count;
      do {
        *buffer = *part_buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);
    }

  }

  else {

    /* make a 8-bit pointer to the memory: */
    parts8 = (_tme_const tme_shared tme_uint8_t *) mem;

    /* assuming sizeof(tme_uint8_t) is one, every address is 8-bit
       aligned and there is never a partial 8-bit part, so no
       misalignment, shifting, or trailing-part handling is needed.
       read full 8-bit parts without shifting: */
    for (; count >= sizeof(tme_uint8_t); ) {
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));

      /* advance: */
      parts8++;
      buffer += sizeof(tme_uint8_t);
      count -= sizeof(tme_uint8_t);
    }

  }
}

/* undefine the macro version of tme_memory_bus_write_buffer: */
#undef tme_memory_bus_write_buffer

/* the bus write buffer function: */
void
tme_memory_bus_write_buffer(tme_shared tme_uint8_t *mem, _tme_const tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  tme_uint8_t *part_buffer;
  unsigned int count_done;
  unsigned int count_misaligned;
  unsigned int bits_misaligned;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64_buffer;
  tme_uint64_t part64;
  tme_uint64_t part64_next;
  tme_uint64_t part64_mask;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32_buffer;
  tme_uint32_t part32;
  tme_uint32_t part32_next;
  tme_uint32_t part32_mask;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16_buffer;
  tme_uint16_t part16;
  tme_uint16_t part16_next;
  tme_uint16_t part16_mask;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8_buffer;
  tme_uint8_t part8;
  tme_uint8_t part8_next;
  tme_uint8_t part8_mask;
  tme_uint8_t part8_cmp;

  assert (count != 0);
  assert (bus_boundary != 0);

  /* if we are locking for all memory accesses, lock memory
     around a memcpy: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    tme_rwlock_wrlock(rwlock);
    memcpy((tme_uint8_t *) (mem), (buffer), (count));
    tme_rwlock_wrunlock(rwlock);
  }

  /* otherwise, if the emulated bus boundary is greater than the
     host's bus boundary, we are forced to stop all other threads
     around a memcpy: */
  else if (__tme_predict_false(bus_boundary == 0
                               || bus_boundary > host_boundary)) {
    tme_thread_suspend_others();
    memcpy((tme_uint8_t *) (mem), (buffer), (count) + (0 && align_min));
    tme_thread_resume_others();
  }

#ifdef TME_HAVE_INT64_T

  else if (host_boundary == sizeof(tme_uint64_t)) {

    /* make a 64-bit pointer to the memory: */
    parts64 = (tme_shared tme_uint64_t *) mem;

    /* if this pointer is not 64-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts64) % sizeof(tme_uint64_t)) != 0)) {

      /* get the misalignment from the previous 64-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts64) % sizeof(tme_uint64_t);

      /* truncate this pointer to the previous 64-bit boundary: */
      parts64 = (tme_shared tme_uint64_t *) (((tme_uintptr_t) parts64) & (((tme_uintptr_t) 0) - sizeof(tme_uint64_t)));

      /* get the number of bytes to write in the first 64-bit memory part: */
      count_done = sizeof(tme_uint64_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* make a mask that clears for the data to write in the
         first 64-bit memory part: */
      part64_mask = 1;
      part64_mask = (part64_mask << (count_done * 8)) - 1;
      part64_mask
        <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (count_misaligned * 8)
             : (64 - ((count_misaligned + count_done) * 8)));
      part64_mask = ~part64_mask;

      /* copy from the buffer the bytes to write in the first
         64-bit memory part: */
      part64_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
      count -= count_done;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the first 64-bit memory part: */
      part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;
        part64 = (part64 & part64_mask) | part64_buffer;
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
      parts64++;
    }

    /* if we have full 64-bit parts to write: */
    if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {

      /* if the buffer is 64-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint64_t)) == 0)) {

        /* write full 64-bit parts without shifting: */
        do {
          part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
          tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));

          /* advance: */
          parts64++;
          buffer += sizeof(tme_uint64_t);
          count -= sizeof(tme_uint64_t);
        } while (count >= sizeof(tme_uint64_t));
      }

      /* otherwise, the buffer is not 64-bit aligned: */
      else {

        /* get the misalignment to the next 64-bit boundary: */
        count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint64_t);

        /* copy from the buffer until it is aligned: */
        part64_buffer = 0;
        part_buffer = ((tme_uint8_t *) &part64_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *part_buffer = *buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* write full 64-bit words with shifting: */
        bits_misaligned = count_misaligned * 8;
        part64 = part64_buffer;
        for (; count >= sizeof(tme_uint64_t); ) {
          part64_next = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part64 |= (part64_next << bits_misaligned);
            tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
            part64 = (part64_next >> (64 - bits_misaligned));
          }
          else {
            part64 |= (part64_next >> bits_misaligned);
            tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
            part64 = (part64_next << (64 - bits_misaligned));
          }

          /* advance: */
          parts64++;
          buffer += sizeof(tme_uint64_t);
          count -= sizeof(tme_uint64_t);
        }

        /* calculate how many more bytes there are to write in this
           64-bit memory part: */
        count_done = sizeof(tme_uint64_t) - count_misaligned;
        part64_buffer = part64;

        /* if we can't write one more full 64-bit memory part: */
        if (count_done > count) {

          /* we will reread this data to write below: */
          buffer -= count_misaligned;
          count += count_misaligned;
        }

        /* otherwise, we can write one more full 64-bit memory part: */
        else {

          /* copy from the buffer until we have the full 64-bit part: */
          part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
          count -= count_done;
          do {
            *part_buffer = *buffer;
            part_buffer++;
            buffer++;
          } while (--count_done != 0);

          /* write the last full 64-bit memory part: */
          part64 = part64_buffer;
          tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
        }
      }
    }

    /* if we still have bytes to write: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 64-bit part to write: */
      assert (count < sizeof(tme_uint64_t));

      /* make a mask that clears for the data to write in the last
         64-bit memory part: */
      part64_mask
        = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
           ? _tme_memory_type_mask(tme_uint64_t, << (count * 8))
           : _tme_memory_type_mask(tme_uint64_t, >> (count * 8)));

      /* copy from the buffer the bytes to write in the last
         64-bit memory part: */
      part64_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part64_buffer);
      count_done = count;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the last 64-bit memory part: */
      part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;
        part64 = (part64 & part64_mask) | part64_buffer;
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }

  }

#endif /* TME_HAVE_INT64_T */

  else if (host_boundary == sizeof(tme_uint32_t)) {

    /* make a 32-bit pointer to the memory: */
    parts32 = (tme_shared tme_uint32_t *) mem;

    /* if this pointer is not 32-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts32) % sizeof(tme_uint32_t)) != 0)) {

      /* get the misalignment from the previous 32-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts32) % sizeof(tme_uint32_t);

      /* truncate this pointer to the previous 32-bit boundary: */
      parts32 = (tme_shared tme_uint32_t *) (((tme_uintptr_t) parts32) & (((tme_uintptr_t) 0) - sizeof(tme_uint32_t)));

      /* get the number of bytes to write in the first 32-bit memory part: */
      count_done = sizeof(tme_uint32_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* make a mask that clears for the data to write in the
         first 32-bit memory part: */
      part32_mask = 1;
      part32_mask = (part32_mask << (count_done * 8)) - 1;
      part32_mask
        <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (count_misaligned * 8)
             : (32 - ((count_misaligned + count_done) * 8)));
      part32_mask = ~part32_mask;

      /* copy from the buffer the bytes to write in the first
         32-bit memory part: */
      part32_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
      count -= count_done;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the first 32-bit memory part: */
      part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;
        part32 = (part32 & part32_mask) | part32_buffer;
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
      parts32++;
    }

    /* if we have full 32-bit parts to write: */
    if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {

      /* if the buffer is 32-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint32_t)) == 0)) {

        /* write full 32-bit parts without shifting: */
        do {
          part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
          tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));

          /* advance: */
          parts32++;
          buffer += sizeof(tme_uint32_t);
          count -= sizeof(tme_uint32_t);
        } while (count >= sizeof(tme_uint32_t));
      }

      /* otherwise, the buffer is not 32-bit aligned: */
      else {

        /* get the misalignment to the next 32-bit boundary: */
        count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint32_t);

        /* copy from the buffer until it is aligned: */
        part32_buffer = 0;
        part_buffer = ((tme_uint8_t *) &part32_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *part_buffer = *buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* write full 32-bit words with shifting: */
        bits_misaligned = count_misaligned * 8;
        part32 = part32_buffer;
        for (; count >= sizeof(tme_uint32_t); ) {
          part32_next = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part32 |= (part32_next << bits_misaligned);
            tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
            part32 = (part32_next >> (32 - bits_misaligned));
          }
          else {
            part32 |= (part32_next >> bits_misaligned);
            tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
            part32 = (part32_next << (32 - bits_misaligned));
          }

          /* advance: */
          parts32++;
          buffer += sizeof(tme_uint32_t);
          count -= sizeof(tme_uint32_t);
        }

        /* calculate how many more bytes there are to write in this
           32-bit memory part: */
        count_done = sizeof(tme_uint32_t) - count_misaligned;
        part32_buffer = part32;

        /* if we can't write one more full 32-bit memory part: */
        if (count_done > count) {

          /* we will reread this data to write below: */
          buffer -= count_misaligned;
          count += count_misaligned;
        }

        /* otherwise, we can write one more full 32-bit memory part: */
        else {

          /* copy from the buffer until we have the full 32-bit part: */
          part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
          count -= count_done;
          do {
            *part_buffer = *buffer;
            part_buffer++;
            buffer++;
          } while (--count_done != 0);

          /* write the last full 32-bit memory part: */
          part32 = part32_buffer;
          tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
        }
      }
    }

    /* if we still have bytes to write: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 32-bit part to write: */
      assert (count < sizeof(tme_uint32_t));

      /* make a mask that clears for the data to write in the last
         32-bit memory part: */
      part32_mask
        = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
           ? _tme_memory_type_mask(tme_uint32_t, << (count * 8))
           : _tme_memory_type_mask(tme_uint32_t, >> (count * 8)));

      /* copy from the buffer the bytes to write in the last
         32-bit memory part: */
      part32_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part32_buffer);
      count_done = count;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the last 32-bit memory part: */
      part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;
        part32 = (part32 & part32_mask) | part32_buffer;
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
    }

  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* make a 16-bit pointer to the memory: */
    parts16 = (tme_shared tme_uint16_t *) mem;

    /* if this pointer is not 16-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts16) % sizeof(tme_uint16_t)) != 0)) {

      /* get the misalignment from the previous 16-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts16) % sizeof(tme_uint16_t);

      /* truncate this pointer to the previous 16-bit boundary: */
      parts16 = (tme_shared tme_uint16_t *) (((tme_uintptr_t) parts16) & (((tme_uintptr_t) 0) - sizeof(tme_uint16_t)));

      /* get the number of bytes to write in the first 16-bit memory part: */
      count_done = sizeof(tme_uint16_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* make a mask that clears for the data to write in the
         first 16-bit memory part: */
      part16_mask = 1;
      part16_mask = (part16_mask << (count_done * 8)) - 1;
      part16_mask
        <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (count_misaligned * 8)
             : (16 - ((count_misaligned + count_done) * 8)));
      part16_mask = ~part16_mask;

      /* copy from the buffer the bytes to write in the first
         16-bit memory part: */
      part16_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
      count -= count_done;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the first 16-bit memory part: */
      part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;
        part16 = (part16 & part16_mask) | part16_buffer;
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
      parts16++;
    }

    /* if we have full 16-bit parts to write: */
    if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {

      /* if the buffer is 16-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint16_t)) == 0)) {

        /* write full 16-bit parts without shifting: */
        do {
          part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
          tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));

          /* advance: */
          parts16++;
          buffer += sizeof(tme_uint16_t);
          count -= sizeof(tme_uint16_t);
        } while (count >= sizeof(tme_uint16_t));
      }

      /* otherwise, the buffer is not 16-bit aligned: */
      else {

        /* get the misalignment to the next 16-bit boundary: */
        count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint16_t);

        /* copy from the buffer until it is aligned: */
        part16_buffer = 0;
        part_buffer = ((tme_uint8_t *) &part16_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *part_buffer = *buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* write full 16-bit words with shifting: */
        bits_misaligned = count_misaligned * 8;
        part16 = part16_buffer;
        for (; count >= sizeof(tme_uint16_t); ) {
          part16_next = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part16 |= (part16_next << bits_misaligned);
            tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
            part16 = (part16_next >> (16 - bits_misaligned));
          }
          else {
            part16 |= (part16_next >> bits_misaligned);
            tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
            part16 = (part16_next << (16 - bits_misaligned));
          }

          /* advance: */
          parts16++;
          buffer += sizeof(tme_uint16_t);
          count -= sizeof(tme_uint16_t);
        }

        /* calculate how many more bytes there are to write in this
           16-bit memory part: */
        count_done = sizeof(tme_uint16_t) - count_misaligned;
        part16_buffer = part16;

        /* if we can't write one more full 16-bit memory part: */
        if (count_done > count) {

          /* we will reread this data to write below: */
          buffer -= count_misaligned;
          count += count_misaligned;
        }

        /* otherwise, we can write one more full 16-bit memory part: */
        else {

          /* copy from the buffer until we have the full 16-bit part: */
          part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
          count -= count_done;
          do {
            *part_buffer = *buffer;
            part_buffer++;
            buffer++;
          } while (--count_done != 0);

          /* write the last full 16-bit memory part: */
          part16 = part16_buffer;
          tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
        }
      }
    }

    /* if we still have bytes to write: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 16-bit part to write: */
      assert (count < sizeof(tme_uint16_t));

      /* make a mask that clears for the data to write in the last
         16-bit memory part: */
      part16_mask
        = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
           ? _tme_memory_type_mask(tme_uint16_t, << (count * 8))
           : _tme_memory_type_mask(tme_uint16_t, >> (count * 8)));

      /* copy from the buffer the bytes to write in the last
         16-bit memory part: */
      part16_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part16_buffer);
      count_done = count;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the last 16-bit memory part: */
      part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;
        part16 = (part16 & part16_mask) | part16_buffer;
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
    }

  }

  else {

    /* make a 8-bit pointer to the memory: */
    parts8 = (tme_shared tme_uint8_t *) mem;

    /* if this pointer is not 8-bit aligned: */
    if (__tme_predict_false((((tme_uintptr_t) parts8) % sizeof(tme_uint8_t)) != 0)) {

      /* get the misalignment from the previous 8-bit boundary: */
      count_misaligned = ((tme_uintptr_t) parts8) % sizeof(tme_uint8_t);

      /* truncate this pointer to the previous 8-bit boundary: */
      parts8 = (tme_shared tme_uint8_t *) (((tme_uintptr_t) parts8) & (((tme_uintptr_t) 0) - sizeof(tme_uint8_t)));

      /* get the number of bytes to write in the first 8-bit memory part: */
      count_done = sizeof(tme_uint8_t) - count_misaligned;
      if (__tme_predict_false(count_done > count)) {
        count_done = count;
      }

      /* make a mask that clears for the data to write in the
         first 8-bit memory part: */
      part8_mask = 1;
      part8_mask = (part8_mask << (count_done * 8)) - 1;
      part8_mask
        <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
             ? (count_misaligned * 8)
             : (8 - ((count_misaligned + count_done) * 8)));
      part8_mask = ~part8_mask;

      /* copy from the buffer the bytes to write in the first
         8-bit memory part: */
      part8_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
      count -= count_done;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the first 8-bit memory part: */
      part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;
        part8 = (part8 & part8_mask) | part8_buffer;
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
      parts8++;
    }

    /* if we have full 8-bit parts to write: */
    if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {

      /* if the buffer is 8-bit aligned: */
      if (__tme_predict_true((((tme_uintptr_t) buffer) % sizeof(tme_uint8_t)) == 0)) {

        /* write full 8-bit parts without shifting: */
        do {
          part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
          tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));

          /* advance: */
          parts8++;
          buffer += sizeof(tme_uint8_t);
          count -= sizeof(tme_uint8_t);
        } while (count >= sizeof(tme_uint8_t));
      }

      /* otherwise, the buffer is not 8-bit aligned: */
      else {

        /* get the misalignment to the next 8-bit boundary: */
        count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (tme_uintptr_t) buffer)) % sizeof(tme_uint8_t);

        /* copy from the buffer until it is aligned: */
        part8_buffer = 0;
        part_buffer = ((tme_uint8_t *) &part8_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *part_buffer = *buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* write full 8-bit words with shifting: */
        bits_misaligned = count_misaligned * 8;
        part8 = part8_buffer;
        for (; count >= sizeof(tme_uint8_t); ) {
          part8_next = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part8 |= (part8_next << bits_misaligned);
            tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
            part8 = (part8_next >> (8 - bits_misaligned));
          }
          else {
            part8 |= (part8_next >> bits_misaligned);
            tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
            part8 = (part8_next << (8 - bits_misaligned));
          }

          /* advance: */
          parts8++;
          buffer += sizeof(tme_uint8_t);
          count -= sizeof(tme_uint8_t);
        }

        /* calculate how many more bytes there are to write in this
           8-bit memory part: */
        count_done = sizeof(tme_uint8_t) - count_misaligned;
        part8_buffer = part8;

        /* if we can't write one more full 8-bit memory part: */
        if (count_done > count) {

          /* we will reread this data to write below: */
          buffer -= count_misaligned;
          count += count_misaligned;
        }

        /* otherwise, we can write one more full 8-bit memory part: */
        else {

          /* copy from the buffer until we have the full 8-bit part: */
          part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
          count -= count_done;
          do {
            *part_buffer = *buffer;
            part_buffer++;
            buffer++;
          } while (--count_done != 0);

          /* write the last full 8-bit memory part: */
          part8 = part8_buffer;
          tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
        }
      }
    }

    /* if we still have bytes to write: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 8-bit part to write: */
      assert (count < sizeof(tme_uint8_t));

      /* make a mask that clears for the data to write in the last
         8-bit memory part: */
      part8_mask
        = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
           ? _tme_memory_type_mask(tme_uint8_t, << (count * 8))
           : _tme_memory_type_mask(tme_uint8_t, >> (count * 8)));

      /* copy from the buffer the bytes to write in the last
         8-bit memory part: */
      part8_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part8_buffer);
      count_done = count;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the last 8-bit memory part: */
      part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;
        part8 = (part8 & part8_mask) | part8_buffer;
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
    }

  }
}

/* the 8-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add8: */
#undef tme_memory_atomic_add8

/* the 8-bit atomic add function: */
tme_uint8_t
tme_memory_atomic_add8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t value_old;
  tme_uint8_t value_new;
  tme_uint8_t value_seen;

  /* slow-path 8-bit atomic add: stores (*memory + operand) back
     into *memory and returns the value that *memory held before
     the add.  rwlock and align_min are handed through unchanged
     to the underlying read, write and compare-and-exchange
     primitives: */

  /* when the atomic alignment for the common memory type is zero,
     the whole read-modify-write is done with plain accesses.  the
     write lock is only taken (and released) when threads are not
     cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_new = value_old + operand;
    tme_memory_write8((tme_uint8_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, build the add out of a compare-and-exchange.  start
     from an atomic snapshot of the memory: */
  value_seen = tme_memory_atomic_read8(memory, rwlock, align_min);

  /* keep retrying until the compare-and-exchange reports back the
     same value that the sum was computed from; on a mismatch the
     value it reported becomes the next starting point: */
  do {
    value_old = value_seen;
    value_new = value_old + operand;
    value_seen = tme_memory_atomic_cx8(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the pre-add value: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_sub8: */
#undef tme_memory_atomic_sub8

/* the 8-bit atomic sub function: */
tme_uint8_t
tme_memory_atomic_sub8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t value_old;
  tme_uint8_t value_new;
  tme_uint8_t value_seen;

  /* slow-path 8-bit atomic subtract: stores (*memory - operand)
     back into *memory and returns the value that *memory held
     before the subtract.  rwlock and align_min are handed through
     unchanged to the underlying read, write and
     compare-and-exchange primitives: */

  /* when the atomic alignment for the common memory type is zero,
     the whole read-modify-write is done with plain accesses.  the
     write lock is only taken (and released) when threads are not
     cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_new = value_old - operand;
    tme_memory_write8((tme_uint8_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, build the subtract out of a compare-and-exchange.
     start from an atomic snapshot of the memory: */
  value_seen = tme_memory_atomic_read8(memory, rwlock, align_min);

  /* keep retrying until the compare-and-exchange reports back the
     same value that the difference was computed from; on a
     mismatch the value it reported becomes the next starting
     point: */
  do {
    value_old = value_seen;
    value_new = value_old - operand;
    value_seen = tme_memory_atomic_cx8(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the pre-subtract value: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_mul8: */
#undef tme_memory_atomic_mul8

/* the 8-bit atomic mul function: */
tme_uint8_t
tme_memory_atomic_mul8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t value_old;
  tme_uint8_t value_new;
  tme_uint8_t value_seen;

  /* slow-path 8-bit atomic multiply: stores (*memory * operand),
     truncated to 8 bits by the assignment, back into *memory and
     returns the value that *memory held before the multiply.
     rwlock and align_min are handed through unchanged to the
     underlying read, write and compare-and-exchange primitives: */

  /* when the atomic alignment for the common memory type is zero,
     the whole read-modify-write is done with plain accesses.  the
     write lock is only taken (and released) when threads are not
     cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_new = value_old * operand;
    tme_memory_write8((tme_uint8_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, build the multiply out of a compare-and-exchange.
     start from an atomic snapshot of the memory: */
  value_seen = tme_memory_atomic_read8(memory, rwlock, align_min);

  /* keep retrying until the compare-and-exchange reports back the
     same value that the product was computed from; on a mismatch
     the value it reported becomes the next starting point: */
  do {
    value_old = value_seen;
    value_new = value_old * operand;
    value_seen = tme_memory_atomic_cx8(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the pre-multiply value: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_div8: */
#undef tme_memory_atomic_div8

/* the 8-bit atomic div function: */
tme_uint8_t
tme_memory_atomic_div8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t value_old;
  tme_uint8_t value_new;
  tme_uint8_t value_seen;

  /* slow-path 8-bit atomic divide: stores (*memory / operand) back
     into *memory and returns the value that *memory held before
     the divide.  rwlock and align_min are handed through unchanged
     to the underlying read, write and compare-and-exchange
     primitives.

     NOTE(review): operand is not checked against zero here;
     presumably callers guarantee a nonzero divisor - confirm: */

  /* when the atomic alignment for the common memory type is zero,
     the whole read-modify-write is done with plain accesses.  the
     write lock is only taken (and released) when threads are not
     cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_new = value_old / operand;
    tme_memory_write8((tme_uint8_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, build the divide out of a compare-and-exchange.
     start from an atomic snapshot of the memory: */
  value_seen = tme_memory_atomic_read8(memory, rwlock, align_min);

  /* keep retrying until the compare-and-exchange reports back the
     same value that the quotient was computed from; on a mismatch
     the value it reported becomes the next starting point: */
  do {
    value_old = value_seen;
    value_new = value_old / operand;
    value_seen = tme_memory_atomic_cx8(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the pre-divide value: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_and8: */
#undef tme_memory_atomic_and8

/* the 8-bit atomic and function: */
tme_uint8_t
tme_memory_atomic_and8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t value_old;
  tme_uint8_t value_new;
  tme_uint8_t value_seen;

  /* slow-path 8-bit atomic bitwise AND: stores (*memory & operand)
     back into *memory and returns the value that *memory held
     before the AND.  rwlock and align_min are handed through
     unchanged to the underlying read, write and
     compare-and-exchange primitives: */

  /* when the atomic alignment for the common memory type is zero,
     the whole read-modify-write is done with plain accesses.  the
     write lock is only taken (and released) when threads are not
     cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_new = value_old & operand;
    tme_memory_write8((tme_uint8_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, build the AND out of a compare-and-exchange.  start
     from an atomic snapshot of the memory: */
  value_seen = tme_memory_atomic_read8(memory, rwlock, align_min);

  /* keep retrying until the compare-and-exchange reports back the
     same value that the result was computed from; on a mismatch
     the value it reported becomes the next starting point: */
  do {
    value_old = value_seen;
    value_new = value_old & operand;
    value_seen = tme_memory_atomic_cx8(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the pre-AND value: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_or8: */
#undef tme_memory_atomic_or8

/* the 8-bit atomic or function: */
tme_uint8_t
tme_memory_atomic_or8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t value_old;
  tme_uint8_t value_new;
  tme_uint8_t value_seen;

  /* slow-path 8-bit atomic bitwise OR: stores (*memory | operand)
     back into *memory and returns the value that *memory held
     before the OR.  rwlock and align_min are handed through
     unchanged to the underlying read, write and
     compare-and-exchange primitives: */

  /* when the atomic alignment for the common memory type is zero,
     the whole read-modify-write is done with plain accesses.  the
     write lock is only taken (and released) when threads are not
     cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_new = value_old | operand;
    tme_memory_write8((tme_uint8_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, build the OR out of a compare-and-exchange.  start
     from an atomic snapshot of the memory: */
  value_seen = tme_memory_atomic_read8(memory, rwlock, align_min);

  /* keep retrying until the compare-and-exchange reports back the
     same value that the result was computed from; on a mismatch
     the value it reported becomes the next starting point: */
  do {
    value_old = value_seen;
    value_new = value_old | operand;
    value_seen = tme_memory_atomic_cx8(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the pre-OR value: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_xor8: */
#undef tme_memory_atomic_xor8

/* atomically xors operand into the 8-bit value at memory, and
   returns the value that memory held before the xor.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint8_t
tme_memory_atomic_xor8(tme_shared tme_uint8_t *memory,
                       tme_uint8_t operand,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint8_t old_value;
  tme_uint8_t new_value;
  tme_uint8_t seen_value;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    old_value = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    new_value = old_value ^ operand;
    tme_memory_write8((tme_uint8_t *) memory, new_value, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (old_value);
  }

  /* otherwise, emulate the atomic 8-bit xor with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  seen_value = tme_memory_atomic_read8(memory, rwlock, align_min);
  do {
    old_value = seen_value;
    new_value = old_value ^ operand;
    seen_value = tme_memory_atomic_cx8(memory, old_value, new_value, rwlock, align_min);
  } while (__tme_predict_false(seen_value != old_value));

  /* the exchange succeeded, so this is the value we replaced: */
  return (old_value);
}

/* undefine any macro version of tme_memory_atomic_not8: */
#undef tme_memory_atomic_not8

/* atomically replaces the 8-bit value at memory with its bitwise
   complement, and returns the value that memory held before.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint8_t
tme_memory_atomic_not8(tme_shared tme_uint8_t *memory,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint8_t old_value;
  tme_uint8_t new_value;
  tme_uint8_t seen_value;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    old_value = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    new_value = ~old_value;
    tme_memory_write8((tme_uint8_t *) memory, new_value, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (old_value);
  }

  /* otherwise, emulate the atomic 8-bit not with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  seen_value = tme_memory_atomic_read8(memory, rwlock, align_min);
  do {
    old_value = seen_value;
    new_value = ~old_value;
    seen_value = tme_memory_atomic_cx8(memory, old_value, new_value, rwlock, align_min);
  } while (__tme_predict_false(seen_value != old_value));

  /* the exchange succeeded, so this is the value we replaced: */
  return (old_value);
}

/* undefine any macro version of tme_memory_atomic_neg8: */
#undef tme_memory_atomic_neg8

/* atomically replaces the 8-bit value at memory with zero minus
   that value, and returns the value that memory held before.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint8_t
tme_memory_atomic_neg8(tme_shared tme_uint8_t *memory,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint8_t old_value;
  tme_uint8_t new_value;
  tme_uint8_t seen_value;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    old_value = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    new_value = 0 - old_value;
    tme_memory_write8((tme_uint8_t *) memory, new_value, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (old_value);
  }

  /* otherwise, emulate the atomic 8-bit neg with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  seen_value = tme_memory_atomic_read8(memory, rwlock, align_min);
  do {
    old_value = seen_value;
    new_value = 0 - old_value;
    seen_value = tme_memory_atomic_cx8(memory, old_value, new_value, rwlock, align_min);
  } while (__tme_predict_false(seen_value != old_value));

  /* the exchange succeeded, so this is the value we replaced: */
  return (old_value);
}

/* undefine any macro version of tme_memory_atomic_xchg8: */
#undef tme_memory_atomic_xchg8

/* atomically stores value_written into the 8-bit location at
   memory, and returns the value that memory held before.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint8_t
tme_memory_atomic_xchg8(tme_shared tme_uint8_t *memory,
                        tme_uint8_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint8_t old_value;
  tme_uint8_t seen_value;

  /* if we can't make direct accesses at all, do a plain read
     followed by a plain write.  the write lock is only taken
     (and released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    old_value = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (old_value);
  }

  /* otherwise, emulate the atomic 8-bit xchg with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  seen_value = tme_memory_atomic_read8(memory, rwlock, align_min);
  do {
    old_value = seen_value;
    seen_value = tme_memory_atomic_cx8(memory, old_value, value_written, rwlock, align_min);
  } while (__tme_predict_false(seen_value != old_value));

  /* the exchange succeeded, so this is the value we replaced: */
  return (old_value);
}

/* undefine any macro version of tme_memory_atomic_cx8: */
#undef tme_memory_atomic_cx8

/* the 8-bit compare-and-exchange.  reads the 8-bit value at
   memory and, only if it equals value_cmp, stores value_written
   there.  the value that was read is returned in either case, so
   the caller can tell whether the store happened.

   rwlock is the lock taken when the operation must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint8_t
tme_memory_atomic_cx8(tme_shared tme_uint8_t *memory,
                      tme_uint8_t value_cmp,
                      tme_uint8_t value_written,
                      tme_rwlock_t *rwlock,
                      unsigned int align_min)
{
  tme_uint8_t found;

  /* if this host CPU can make atomic accesses to at least the
     most common memory size, this function should only get
     called when the CPU can't do an atomic 8-bit cx at all, or
     can't do it at this alignment.  those cases are assumed to
     be rare, so we simply stop all other threads for the
     duration of the compare and the conditional store: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    found = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    if (value_cmp == found) {
      tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
    return (found);
  }

  /* otherwise, we can't make direct accesses at all, and the
     operation is done under the write lock.  (when threads are
     cooperative the lock is neither taken nor released): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  found = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
  if (value_cmp == found) {
    tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
  }
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }

  /* return the value read: */
  return (found);
}

/* the 16-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add16: */
#undef tme_memory_atomic_add16

/* atomically adds operand to the 16-bit value at memory, and
   returns the value that memory held before the add.  the sum is
   truncated to 16 bits when it is stored.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_add16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = expected + operand;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit add with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = expected + operand;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_sub16: */
#undef tme_memory_atomic_sub16

/* atomically subtracts operand from the 16-bit value at memory,
   and returns the value that memory held before the sub.  the
   difference is truncated to 16 bits when it is stored.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_sub16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = expected - operand;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit sub with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = expected - operand;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_mul16: */
#undef tme_memory_atomic_mul16

/* atomically multiplies the 16-bit value at memory by operand,
   and returns the value that memory held before the mul.  only
   the low 16 bits of the product are stored.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives.

   NB: the product is deliberately computed in unsigned int.
   multiplying two 16-bit operands directly promotes both to
   (signed) int, and a product like 0xffff * 0xffff does not fit
   in a 32-bit int, which is undefined behavior.  unsigned
   arithmetic wraps instead, and the low 16 bits that we keep are
   the same: */
tme_uint16_t
tme_memory_atomic_mul16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t value_read;
  tme_uint16_t value_written;
  tme_uint16_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    value_written = (tme_uint16_t) ((unsigned int) value_read * (unsigned int) operand);
    tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 16-bit
     mul at all, or if it can't do it at this alignment.

     we emulate the atomic 16-bit mul with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read16(memory, rwlock, align_min);

    /* spin the mul in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write, multiplying as unsigned int to
         avoid signed overflow in the promoted operands: */
      value_written = (tme_uint16_t) ((unsigned int) value_read * (unsigned int) operand);

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_div16: */
#undef tme_memory_atomic_div16

/* atomically divides the 16-bit value at memory by operand, and
   returns the value that memory held before the div.

   operand is used as the divisor without being checked here, so
   the caller must not pass zero.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_div16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = expected / operand;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit div with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = expected / operand;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_and16: */
#undef tme_memory_atomic_and16

/* atomically ands operand into the 16-bit value at memory, and
   returns the value that memory held before the and.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_and16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = expected & operand;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit and with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = expected & operand;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_or16: */
#undef tme_memory_atomic_or16

/* atomically ors operand into the 16-bit value at memory, and
   returns the value that memory held before the or.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_or16(tme_shared tme_uint16_t *memory,
                       tme_uint16_t operand,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = expected | operand;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit or with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = expected | operand;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_xor16: */
#undef tme_memory_atomic_xor16

/* atomically xors operand into the 16-bit value at memory, and
   returns the value that memory held before the xor.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_xor16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = expected ^ operand;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit xor with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = expected ^ operand;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_not16: */
#undef tme_memory_atomic_not16

/* atomically replaces the 16-bit value at memory with its
   bitwise complement, and returns the value that memory held
   before.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_not16(tme_shared tme_uint16_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = ~expected;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit not with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = ~expected;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_neg16: */
#undef tme_memory_atomic_neg16

/* atomically replaces the 16-bit value at memory with zero minus
   that value, and returns the value that memory held before.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_neg16(tme_shared tme_uint16_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t desired;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain
     read-modify-write.  the write lock is only taken (and
     released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    desired = 0 - expected;
    tme_memory_write16((tme_uint16_t *) memory, desired, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit neg with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    desired = 0 - expected;
    observed = tme_memory_atomic_cx16(memory, expected, desired, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_xchg16: */
#undef tme_memory_atomic_xchg16

/* atomically stores value_written into the 16-bit location at
   memory, and returns the value that memory held before.

   rwlock is the lock taken when the update must be done under
   lock, and align_min is handed unchanged to the underlying
   memory access primitives: */
tme_uint16_t
tme_memory_atomic_xchg16(tme_shared tme_uint16_t *memory,
                         tme_uint16_t value_written,
                         tme_rwlock_t *rwlock,
                         unsigned int align_min)
{
  tme_uint16_t expected;
  tme_uint16_t observed;

  /* if we can't make direct accesses at all, do a plain read
     followed by a plain write.  the write lock is only taken
     (and released) when threads are not cooperative: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    expected = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (expected);
  }

  /* otherwise, emulate the atomic 16-bit xchg with a
     compare-and-exchange.  we start from an atomic read, and
     whenever the exchange fails we retry with the value that the
     exchange found in the memory: */
  observed = tme_memory_atomic_read16(memory, rwlock, align_min);
  do {
    expected = observed;
    observed = tme_memory_atomic_cx16(memory, expected, value_written, rwlock, align_min);
  } while (__tme_predict_false(observed != expected));

  /* the exchange succeeded, so this is the value we replaced: */
  return (expected);
}

/* undefine any macro version of tme_memory_atomic_cx16: */
#undef tme_memory_atomic_cx16

/* the 16-bit atomic cx (compare-and-exchange) function.  this
   writes value_written into the 16-bit memory only if the memory
   currently holds value_cmp, and it always returns the value that
   was read from the memory.  rwlock is only used when the host
   can't make any direct atomic accesses, and align_min is handed
   through unchanged to the underlying memory accesses: */
tme_uint16_t
tme_memory_atomic_cx16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t value_cmp,
                        tme_uint16_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t value_old;

  /* if the host CPU can make atomic accesses to at least the most
     common memory size, threads are not cooperative, and we should
     only be here because the host can't do an atomic 16-bit cx at
     all, or can't do it at this alignment.

     these problematic cxs are assumed to be rare, so they are
     emulated by simply stopping all other threads for the duration
     of the compare and the conditional write: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    value_old = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    if (value_old == value_cmp) {
      tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
    return (value_old);
  }

  /* otherwise, no direct atomic accesses are possible at all, and
     the cx has to happen under the write lock (except that
     cooperative threads skip the actual locking): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  value_old = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
  if (value_old == value_cmp) {
    tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
  }
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }

  /* return the value that was read: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_read16: */
#undef tme_memory_atomic_read16

/* the 16-bit atomic read function.  this returns the 16-bit
   value read from the memory.  rwlock is only used when the host
   can't make any direct atomic accesses, and align_min is handed
   through unchanged to the underlying memory read: */
tme_uint16_t
tme_memory_atomic_read16(_tme_const tme_shared tme_uint16_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint16_t value;

  /* if the host CPU can make atomic accesses to at least the most
     common memory size, threads are not cooperative, and we should
     only be here because the host can't do an atomic 16-bit read
     at all, or can't do it at this alignment.

     these problematic reads are assumed to be rare, so they are
     emulated by simply stopping all other threads for the duration
     of the read: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    value = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    tme_thread_resume_others();
    return (value);
  }

  /* otherwise, no direct atomic accesses are possible at all, and
     the read has to happen under the read lock (except that
     cooperative threads skip the actual locking): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_rdlock(rwlock);
  }
  value = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
  if (!tme_thread_cooperative()) {
    tme_rwlock_rdunlock(rwlock);
  }

  /* return the value that was read: */
  return (value);
}

/* undefine any macro version of tme_memory_atomic_write16: */
#undef tme_memory_atomic_write16

/* the 16-bit atomic write function.  this stores value_written
   into the 16-bit memory.  rwlock is only used when the host
   can't make any direct atomic accesses, and align_min is handed
   through unchanged to the underlying memory write: */
void
tme_memory_atomic_write16(tme_shared tme_uint16_t *memory,
                        tme_uint16_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{

  /* if the host CPU can make atomic accesses to at least the most
     common memory size, threads are not cooperative, and we should
     only be here because the host can't do an atomic 16-bit write
     at all, or can't do it at this alignment.

     these problematic writes are assumed to be rare, so they are
     emulated by simply stopping all other threads for the duration
     of the write: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    tme_thread_resume_others();
    return;
  }

  /* otherwise, no direct atomic accesses are possible at all, and
     the write has to happen under the write lock (except that
     cooperative threads skip the actual locking): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }
}

/* the 32-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add32: */
#undef tme_memory_atomic_add32

/* the 32-bit atomic add function.  this replaces the 32-bit
   value in the memory with that value plus operand, and returns
   the value that the memory held before the add.  rwlock is
   taken here only when the host can't make any direct atomic
   accesses, and is otherwise passed on to the atomic read and
   compare-and-exchange.  align_min is handed through unchanged
   to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_add32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old + operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit add at all, or can't do it at this alignment.

     emulate the add by retrying a compare-and-exchange until the
     value it finds in memory is the value the sum was made from.
     the first such value comes from an atomic read, and each later
     one comes from the compare-and-exchange that just failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old + operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the add: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_sub32: */
#undef tme_memory_atomic_sub32

/* the 32-bit atomic sub function.  this replaces the 32-bit
   value in the memory with that value minus operand, and returns
   the value that the memory held before the sub.  rwlock is
   taken here only when the host can't make any direct atomic
   accesses, and is otherwise passed on to the atomic read and
   compare-and-exchange.  align_min is handed through unchanged
   to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_sub32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old - operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit sub at all, or can't do it at this alignment.

     emulate the sub by retrying a compare-and-exchange until the
     value it finds in memory is the value the difference was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old - operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the sub: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_mul32: */
#undef tme_memory_atomic_mul32

/* the 32-bit atomic mul function.  this replaces the 32-bit
   value in the memory with that value times operand, and returns
   the value that the memory held before the mul.  rwlock is
   taken here only when the host can't make any direct atomic
   accesses, and is otherwise passed on to the atomic read and
   compare-and-exchange.  align_min is handed through unchanged
   to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_mul32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old * operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit mul at all, or can't do it at this alignment.

     emulate the mul by retrying a compare-and-exchange until the
     value it finds in memory is the value the product was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old * operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the mul: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_div32: */
#undef tme_memory_atomic_div32

/* the 32-bit atomic div function.  this replaces the 32-bit
   value in the memory with that value divided by operand, and
   returns the value that the memory held before the div.  rwlock
   is taken here only when the host can't make any direct atomic
   accesses, and is otherwise passed on to the atomic read and
   compare-and-exchange.  align_min is handed through unchanged
   to the underlying memory accesses.

   NB: operand is not checked against zero in this function: */
tme_uint32_t
tme_memory_atomic_div32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old / operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit div at all, or can't do it at this alignment.

     emulate the div by retrying a compare-and-exchange until the
     value it finds in memory is the value the quotient was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old / operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the div: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_and32: */
#undef tme_memory_atomic_and32

/* the 32-bit atomic and function.  this replaces the 32-bit
   value in the memory with the bitwise AND of that value and
   operand, and returns the value that the memory held before the
   and.  rwlock is taken here only when the host can't make any
   direct atomic accesses, and is otherwise passed on to the
   atomic read and compare-and-exchange.  align_min is handed
   through unchanged to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_and32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old & operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit and at all, or can't do it at this alignment.

     emulate the and by retrying a compare-and-exchange until the
     value it finds in memory is the value the result was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old & operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the and: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_or32: */
#undef tme_memory_atomic_or32

/* the 32-bit atomic or function.  this replaces the 32-bit
   value in the memory with the bitwise OR of that value and
   operand, and returns the value that the memory held before the
   or.  rwlock is taken here only when the host can't make any
   direct atomic accesses, and is otherwise passed on to the
   atomic read and compare-and-exchange.  align_min is handed
   through unchanged to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_or32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old | operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit or at all, or can't do it at this alignment.

     emulate the or by retrying a compare-and-exchange until the
     value it finds in memory is the value the result was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old | operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the or: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_xor32: */
#undef tme_memory_atomic_xor32

/* the 32-bit atomic xor function.  this replaces the 32-bit
   value in the memory with the bitwise XOR of that value and
   operand, and returns the value that the memory held before the
   xor.  rwlock is taken here only when the host can't make any
   direct atomic accesses, and is otherwise passed on to the
   atomic read and compare-and-exchange.  align_min is handed
   through unchanged to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_xor32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = value_old ^ operand;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit xor at all, or can't do it at this alignment.

     emulate the xor by retrying a compare-and-exchange until the
     value it finds in memory is the value the result was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = value_old ^ operand;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the xor: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_not32: */
#undef tme_memory_atomic_not32

/* the 32-bit atomic not function.  this replaces the 32-bit
   value in the memory with its bitwise complement, and returns
   the value that the memory held before the not.  rwlock is
   taken here only when the host can't make any direct atomic
   accesses, and is otherwise passed on to the atomic read and
   compare-and-exchange.  align_min is handed through unchanged
   to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_not32(tme_shared tme_uint32_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = ~value_old;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit not at all, or can't do it at this alignment.

     emulate the not by retrying a compare-and-exchange until the
     value it finds in memory is the value the complement was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = ~value_old;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the not: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_neg32: */
#undef tme_memory_atomic_neg32

/* the 32-bit atomic neg function.  this replaces the 32-bit
   value in the memory with zero minus that value, and returns
   the value that the memory held before the neg.  rwlock is
   taken here only when the host can't make any direct atomic
   accesses, and is otherwise passed on to the atomic read and
   compare-and-exchange.  align_min is handed through unchanged
   to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_neg32(tme_shared tme_uint32_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_new;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_new = 0 - value_old;
    tme_memory_write32((tme_uint32_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit neg at all, or can't do it at this alignment.

     emulate the neg by retrying a compare-and-exchange until the
     value it finds in memory is the value the negation was made
     from.  the first such value comes from an atomic read, and
     each later one comes from the compare-and-exchange that just
     failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_new = 0 - value_old;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value from before the neg: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_xchg32: */
#undef tme_memory_atomic_xchg32

/* the 32-bit atomic xchg function.  this stores value_written
   into the 32-bit memory and returns the value that it replaced.
   rwlock is taken here only when the host can't make any direct
   atomic accesses, and is otherwise passed on to the atomic read
   and compare-and-exchange.  align_min is handed through
   unchanged to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_xchg32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;
  tme_uint32_t value_seen;

  /* when no direct atomic accesses are possible, every atomic
     access has to happen under the lock (except that cooperative
     threads skip the actual locking): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, threads are not cooperative and the host CPU can
     make atomic accesses to at least the most common memory size,
     so we should only be here because the host can't do an atomic
     32-bit xchg at all, or can't do it at this alignment.

     emulate the xchg by retrying a compare-and-exchange until the
     value it finds in memory is the value we expected.  the first
     expected value comes from an atomic read, and each later one
     comes from the compare-and-exchange that just failed: */
  value_seen = tme_memory_atomic_read32(memory, rwlock, align_min);
  do {
    value_old = value_seen;
    value_seen = tme_memory_atomic_cx32(memory, value_old, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* return the value that was replaced: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_cx32: */
#undef tme_memory_atomic_cx32

/* the 32-bit atomic cx (compare-and-exchange) function.  this
   writes value_written into the 32-bit memory only if the memory
   currently holds value_cmp, and it always returns the value that
   was read from the memory.  rwlock is only used when the host
   can't make any direct atomic accesses, and align_min is handed
   through unchanged to the underlying memory accesses: */
tme_uint32_t
tme_memory_atomic_cx32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t value_cmp,
                        tme_uint32_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_old;

  /* if the host CPU can make atomic accesses to at least the most
     common memory size, threads are not cooperative, and we should
     only be here because the host can't do an atomic 32-bit cx at
     all, or can't do it at this alignment.

     these problematic cxs are assumed to be rare, so they are
     emulated by simply stopping all other threads for the duration
     of the compare and the conditional write: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    if (value_old == value_cmp) {
      tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
    return (value_old);
  }

  /* otherwise, no direct atomic accesses are possible at all, and
     the cx has to happen under the write lock (except that
     cooperative threads skip the actual locking): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  value_old = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
  if (value_old == value_cmp) {
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
  }
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }

  /* return the value that was read: */
  return (value_old);
}

/* undefine any macro version of tme_memory_atomic_read32: */
#undef tme_memory_atomic_read32

/* the 32-bit atomic read function.  this returns the 32-bit
   value read from the memory.  rwlock is only used when the host
   can't make any direct atomic accesses, and align_min is handed
   through unchanged to the underlying memory read: */
tme_uint32_t
tme_memory_atomic_read32(_tme_const tme_shared tme_uint32_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value;

  /* if the host CPU can make atomic accesses to at least the most
     common memory size, threads are not cooperative, and we should
     only be here because the host can't do an atomic 32-bit read
     at all, or can't do it at this alignment.

     these problematic reads are assumed to be rare, so they are
     emulated by simply stopping all other threads for the duration
     of the read: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    value = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    tme_thread_resume_others();
    return (value);
  }

  /* otherwise, no direct atomic accesses are possible at all, and
     the read has to happen under the read lock (except that
     cooperative threads skip the actual locking): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_rdlock(rwlock);
  }
  value = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
  if (!tme_thread_cooperative()) {
    tme_rwlock_rdunlock(rwlock);
  }

  /* return the value that was read: */
  return (value);
}

/* undefine any macro version of tme_memory_atomic_write32: */
#undef tme_memory_atomic_write32

/* the 32-bit atomic write function.  this stores value_written
   into the 32-bit memory.  rwlock is only used when the host
   can't make any direct atomic accesses, and align_min is handed
   through unchanged to the underlying memory write: */
void
tme_memory_atomic_write32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{

  /* if the host CPU can make atomic accesses to at least the most
     common memory size, threads are not cooperative, and we should
     only be here because the host can't do an atomic 32-bit write
     at all, or can't do it at this alignment.

     these problematic writes are assumed to be rare, so they are
     emulated by simply stopping all other threads for the duration
     of the write: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    tme_thread_resume_others();
    return;
  }

  /* otherwise, no direct atomic accesses are possible at all, and
     the write has to happen under the write lock (except that
     cooperative threads skip the actual locking): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }
}

#ifdef TME_HAVE_INT64_T

/* the 64-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add64: */
#undef tme_memory_atomic_add64

/* the 64-bit atomic add function.  this replaces the 64-bit value
   at memory with that value plus operand, and returns the value
   that was at memory before the add.  rwlock and align_min are
   handed on to the underlying reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_add64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read + operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit add at all, or can't do
     one at this alignment.  we emulate the add with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the sum.  the
     compare-and-exchange returns the value it found in the memory,
     so we are done once that matches what we expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read + operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the add replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_sub64: */
#undef tme_memory_atomic_sub64

/* the 64-bit atomic sub function.  this replaces the 64-bit value
   at memory with that value minus operand, and returns the value
   that was at memory before the sub.  rwlock and align_min are
   handed on to the underlying reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_sub64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read - operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit sub at all, or can't do
     one at this alignment.  we emulate the sub with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the
     difference.  the compare-and-exchange returns the value it
     found in the memory, so we are done once that matches what we
     expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read - operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the sub replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_mul64: */
#undef tme_memory_atomic_mul64

/* the 64-bit atomic mul function.  this replaces the 64-bit value
   at memory with that value times operand, and returns the value
   that was at memory before the mul.  rwlock and align_min are
   handed on to the underlying reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_mul64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read * operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit mul at all, or can't do
     one at this alignment.  we emulate the mul with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the product.
     the compare-and-exchange returns the value it found in the
     memory, so we are done once that matches what we expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read * operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the mul replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_div64: */
#undef tme_memory_atomic_div64

/* the 64-bit atomic div function.  this replaces the 64-bit value
   at memory with that value divided by operand, and returns the
   value that was at memory before the div.  rwlock and align_min
   are handed on to the underlying reads, writes and atomic helpers.

   NOTE(review): operand is used as the divisor without any check
   for zero in this function - presumably callers guarantee that it
   is nonzero; confirm: */
tme_uint64_t
tme_memory_atomic_div64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read / operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit div at all, or can't do
     one at this alignment.  we emulate the div with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the quotient.
     the compare-and-exchange returns the value it found in the
     memory, so we are done once that matches what we expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read / operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the div replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_and64: */
#undef tme_memory_atomic_and64

/* the 64-bit atomic and function.  this replaces the 64-bit value
   at memory with the bitwise and of that value and operand, and
   returns the value that was at memory before the and.  rwlock and
   align_min are handed on to the underlying reads, writes and
   atomic helpers: */
tme_uint64_t
tme_memory_atomic_and64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read & operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit and at all, or can't do
     one at this alignment.  we emulate the and with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the masked
     value.  the compare-and-exchange returns the value it found in
     the memory, so we are done once that matches what we expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read & operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the and replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_or64: */
#undef tme_memory_atomic_or64

/* the 64-bit atomic or function.  this replaces the 64-bit value
   at memory with the bitwise or of that value and operand, and
   returns the value that was at memory before the or.  rwlock and
   align_min are handed on to the underlying reads, writes and
   atomic helpers: */
tme_uint64_t
tme_memory_atomic_or64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read | operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit or at all, or can't do
     one at this alignment.  we emulate the or with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the value
     with the operand bits set.  the compare-and-exchange returns
     the value it found in the memory, so we are done once that
     matches what we expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read | operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the or replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_xor64: */
#undef tme_memory_atomic_xor64

/* the 64-bit atomic xor function.  this replaces the 64-bit value
   at memory with the bitwise exclusive-or of that value and
   operand, and returns the value that was at memory before the
   xor.  rwlock and align_min are handed on to the underlying
   reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_xor64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read ^ operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit xor at all, or can't do
     one at this alignment.  we emulate the xor with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the value
     with the operand bits toggled.  the compare-and-exchange
     returns the value it found in the memory, so we are done once
     that matches what we expected: */
  do {
    value_read = value_read_verify;
    value_written = value_read ^ operand;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the xor replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_not64: */
#undef tme_memory_atomic_not64

/* the 64-bit atomic not function.  this replaces the 64-bit value
   at memory with its bitwise complement, and returns the value
   that was at memory before the not.  rwlock and align_min are
   handed on to the underlying reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_not64(tme_shared tme_uint64_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = ~value_read;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit not at all, or can't do
     one at this alignment.  we emulate the not with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for its
     complement.  the compare-and-exchange returns the value it
     found in the memory, so we are done once that matches what we
     expected: */
  do {
    value_read = value_read_verify;
    value_written = ~value_read;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the not replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_neg64: */
#undef tme_memory_atomic_neg64

/* the 64-bit atomic neg function.  this replaces the 64-bit value
   at memory with zero minus that value, and returns the value that
   was at memory before the neg.  rwlock and align_min are handed
   on to the underlying reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_neg64(tme_shared tme_uint64_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = 0 - value_read;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit neg at all, or can't do
     one at this alignment.  we emulate the neg with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for its
     negation.  the compare-and-exchange returns the value it found
     in the memory, so we are done once that matches what we
     expected: */
  do {
    value_read = value_read_verify;
    value_written = 0 - value_read;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the neg replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_xchg64: */
#undef tme_memory_atomic_xchg64

/* the 64-bit atomic xchg function.  this stores value_written into
   the 64-bit value at memory, and returns the value that was at
   memory before the store.  rwlock and align_min are handed on to
   the underlying reads, writes and atomic helpers: */
tme_uint64_t
tme_memory_atomic_xchg64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain read
     followed by a plain write under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 64-bit xchg at all, or can't do
     one at this alignment.  we emulate the xchg with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the new
     value.  the compare-and-exchange returns the value it found in
     the memory, so we are done once that matches what we expected: */
  do {
    value_read = value_read_verify;
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the xchg replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_cx64: */
#undef tme_memory_atomic_cx64

/* the 64-bit atomic cx (compare-and-exchange) function.  this
   reads the 64-bit value at memory and, only if it equals
   value_cmp, stores value_written there.  the value read is always
   returned, so the exchange happened exactly when the return value
   equals value_cmp.  rwlock is only taken when the accesses must
   be done under lock, and align_min is handed unchanged to the
   plain read and write: */
tme_uint64_t
tme_memory_atomic_cx64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t value_cmp,
                        tme_uint64_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;

  /* if this host CPU can make atomic accesses to at least the most
     common memory size, we should only be called because it can't
     do an atomic 64-bit cx at all, or can't do one at this
     alignment.  we assume that is rare, and emulate the atomic cx
     by stopping all other threads around a plain compare and
     write: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
    return (value_read);
  }

  /* otherwise, no direct accesses can be made at all, and the
     compare and write must be done under the write lock.  (the
     lock itself is skipped when threads are cooperative): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
  if (value_read == value_cmp) {
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
  }
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }

  /* return the value found in the memory: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_read64: */
#undef tme_memory_atomic_read64

/* the 64-bit atomic read function.  this returns the 64-bit value
   at memory.  rwlock is only taken (as a read lock) when the read
   must be done under lock, and align_min is handed unchanged to
   tme_memory_read64(): */
tme_uint64_t
tme_memory_atomic_read64(_tme_const tme_shared tme_uint64_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;

  /* if this host CPU can make atomic accesses to at least the most
     common memory size, we should only be called because it can't
     do an atomic 64-bit read at all, or can't do one at this
     alignment.  we assume that is rare, and emulate the atomic
     read by stopping all other threads around a plain read: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    tme_thread_resume_others();
    return (value_read);
  }

  /* otherwise, no direct accesses can be made at all, and the
     read must be done under the read lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_rdlock(rwlock);
  }
  value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
  if (!tme_thread_cooperative()) {
    tme_rwlock_rdunlock(rwlock);
  }

  /* return the value found in the memory: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_write64: */
#undef tme_memory_atomic_write64

/* the 64-bit atomic write function.  this stores value_written
   into the 64-bit value at memory and returns nothing.  rwlock is
   only taken when the store must be done under lock, and align_min
   is handed unchanged to tme_memory_write64(): */
void
tme_memory_atomic_write64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{

  /* if this host CPU can make atomic accesses to at least the most
     common memory size, we should only be called because it can't
     do an atomic 64-bit write at all, or can't do one at this
     alignment.  we assume that is rare, and emulate the atomic
     write by stopping all other threads around a plain write: */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0) {
    tme_thread_suspend_others();
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    tme_thread_resume_others();
    return;
  }

  /* otherwise, no direct accesses can be made at all, and the
     write must be done under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrlock(rwlock);
  }
  tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
  if (!tme_thread_cooperative()) {
    tme_rwlock_wrunlock(rwlock);
  }
}

#endif /* TME_HAVE_INT64_T */

#ifdef TME_HAVE_INT128_T

/* the 128-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add128: */
#undef tme_memory_atomic_add128

/* the 128-bit atomic add function.  this replaces the 128-bit
   value at memory with that value plus operand, and returns the
   value that was at memory before the add.  rwlock and align_min
   are handed on to the underlying reads, writes and atomic
   helpers: */
tme_uint128_t
tme_memory_atomic_add128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_read;
  tme_uint128_t value_written;
  tme_uint128_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_written = value_read + operand;
    tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 128-bit add at all, or can't do
     one at this alignment.  we emulate the add with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the sum.  if
     the compare-and-exchange hands back a different value than we
     expected, it failed, and we go around again with that value: */
  do {
    value_read = value_read_verify;
    value_written = value_read + operand;
    value_read_verify = tme_memory_atomic_cx128(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the add replaced: */
  return (value_read);
}

/* undefine any macro version of tme_memory_atomic_sub128: */
#undef tme_memory_atomic_sub128

/* the 128-bit atomic sub function.  this replaces the 128-bit
   value at memory with that value minus operand, and returns the
   value that was at memory before the sub.  rwlock and align_min
   are handed on to the underlying reads, writes and atomic
   helpers: */
tme_uint128_t
tme_memory_atomic_sub128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_read;
  tme_uint128_t value_written;
  tme_uint128_t value_read_verify;

  /* if no direct accesses can be made at all, do a plain
     read-modify-write under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_written = value_read - operand;
    tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_read);
  }

  /* otherwise, this host CPU can make atomic accesses to at least
     the most common memory size, and we should only be called
     because it can't do an atomic 128-bit sub at all, or can't do
     one at this alignment.  we emulate the sub with the atomic
     read and the atomic compare-and-exchange.

     start from an atomic read of the memory: */
  value_read_verify = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* on each pass, take the value most recently seen in the memory
     as the expected value and try to exchange it for the
     difference.  if the compare-and-exchange hands back a
     different value than we expected, it failed, and we go around
     again with that value: */
  do {
    value_read = value_read_verify;
    value_written = value_read - operand;
    value_read_verify = tme_memory_atomic_cx128(memory, value_read, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_read_verify != value_read));

  /* return the value that the sub replaced: */
  return (value_read);
}

/* drop any macro version of tme_memory_atomic_mul128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_mul128

/* the out-of-line 128-bit atomic multiply.  the value at memory is
   replaced with that value times operand, and the value that was
   replaced is returned.  rwlock and align_min are handed on to the
   underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_mul128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = value_old * operand;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit mul (or not at this alignment), so the mul is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one the product was made from: */
  do {
    value_old = value_seen;
    value_new = value_old * operand;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_div128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_div128

/* the out-of-line 128-bit atomic divide.  the value at memory is
   replaced with that value divided by operand, and the value that
   was replaced is returned.  rwlock and align_min are handed on to
   the underlying memory accessors.

   NOTE(review): nothing in this function checks for a zero operand;
   presumably callers rule that out - confirm: */
tme_uint128_t
tme_memory_atomic_div128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = value_old / operand;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit div (or not at this alignment), so the div is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one the quotient was made from: */
  do {
    value_old = value_seen;
    value_new = value_old / operand;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_and128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_and128

/* the out-of-line 128-bit atomic bitwise and.  the value at memory
   is replaced with that value anded with operand, and the value
   that was replaced is returned.  rwlock and align_min are handed
   on to the underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_and128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = value_old & operand;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit and (or not at this alignment), so the and is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one the new value was made from: */
  do {
    value_old = value_seen;
    value_new = value_old & operand;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_or128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_or128

/* the out-of-line 128-bit atomic bitwise or.  the value at memory
   is replaced with that value ored with operand, and the value
   that was replaced is returned.  rwlock and align_min are handed
   on to the underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_or128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = value_old | operand;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit or (or not at this alignment), so the or is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one the new value was made from: */
  do {
    value_old = value_seen;
    value_new = value_old | operand;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_xor128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_xor128

/* the out-of-line 128-bit atomic bitwise exclusive-or.  the value
   at memory is replaced with that value xored with operand, and the
   value that was replaced is returned.  rwlock and align_min are
   handed on to the underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_xor128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = value_old ^ operand;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit xor (or not at this alignment), so the xor is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one the new value was made from: */
  do {
    value_old = value_seen;
    value_new = value_old ^ operand;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_not128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_not128

/* the out-of-line 128-bit atomic bitwise not.  the value at memory
   is replaced with its bitwise complement, and the value that was
   replaced is returned.  rwlock and align_min are handed on to the
   underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_not128(tme_shared tme_uint128_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = ~value_old;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit not (or not at this alignment), so the not is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one that was complemented: */
  do {
    value_old = value_seen;
    value_new = ~value_old;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_neg128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_neg128

/* the out-of-line 128-bit atomic negate.  the value at memory is
   replaced with zero minus that value, and the value that was
   replaced is returned.  rwlock and align_min are handed on to the
   underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_neg128(tme_shared tme_uint128_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_new;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the whole
     read-modify-write happens under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    value_new = 0 - value_old;
    tme_memory_write128((tme_uint128_t *) memory, value_new, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit neg (or not at this alignment), so the neg is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one that was negated: */
  do {
    value_old = value_seen;
    value_new = 0 - value_old;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_new, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_xchg128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_xchg128

/* the out-of-line 128-bit atomic exchange.  value_written is stored
   at memory, and the value that it replaced is returned.  rwlock
   and align_min are handed on to the underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_xchg128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;
  tme_uint128_t value_seen;

  /* when no direct atomic accesses are possible at all, the read
     and the write both happen under the write lock.  (the lock
     itself is skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit xchg (or not at this alignment), so the xchg is emulated
     with a compare-and-exchange.  start from an atomic read: */
  value_seen = tme_memory_atomic_read128(memory, rwlock, align_min);

  /* retry for as long as the compare-and-exchange reports a value
     other than the one it was told to expect: */
  do {
    value_old = value_seen;
    value_seen = tme_memory_atomic_cx128(memory, value_old, value_written, rwlock, align_min);
  } while (__tme_predict_false(value_seen != value_old));

  /* hand back the value that was replaced: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_cx128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_cx128

/* the out-of-line 128-bit atomic compare-and-exchange.  the value
   at memory is read, and only if it equals value_cmp is
   value_written stored there.  the value read is always returned,
   so the exchange happened exactly when the return value equals
   value_cmp.  rwlock is the lock used on the locked path, and
   align_min is handed on to the underlying memory accessors: */
tme_uint128_t
tme_memory_atomic_cx128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t value_cmp,
                        tme_uint128_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_old;

  /* when no direct atomic accesses are possible at all, the compare
     and the write happen under the write lock.  (the lock itself is
     skipped when threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    if (value_cmp == value_old) {
      tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
    }
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return (value_old);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit cx (or not at this alignment).  these cases are assumed
     to be rare, so they are handled by stopping all other threads
     for the duration of the compare and the write: */
  tme_thread_suspend_others();
  value_old = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
  if (value_cmp == value_old) {
    tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
  }
  tme_thread_resume_others();

  /* hand back the value that was read: */
  return (value_old);
}

/* drop any macro version of tme_memory_atomic_read128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_read128

/* the out-of-line 128-bit atomic read.  this returns the value at
   memory.  rwlock is the lock used on the locked path, and
   align_min is handed on to the underlying memory accessor: */
tme_uint128_t
tme_memory_atomic_read128(_tme_const tme_shared tme_uint128_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint128_t value_now;

  /* when no direct atomic accesses are possible at all, the read
     happens under the read lock.  (the lock itself is skipped when
     threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_rdlock(rwlock);
    }
    value_now = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_rdunlock(rwlock);
    }
    return (value_now);
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit read (or not at this alignment).  these cases are
     assumed to be rare, so they are handled by stopping all other
     threads for the duration of the read: */
  tme_thread_suspend_others();
  value_now = tme_memory_read128((_tme_const tme_uint128_t *) memory, align_min);
  tme_thread_resume_others();

  /* hand back the value that was read: */
  return (value_now);
}

/* drop any macro version of tme_memory_atomic_write128, so that the
   function of the same name can be defined below: */
#undef tme_memory_atomic_write128

/* the out-of-line 128-bit atomic write.  this stores value_written
   at memory and returns nothing.  rwlock is the lock used on the
   locked path, and align_min is handed on to the underlying memory
   accessor: */
void
tme_memory_atomic_write128(tme_shared tme_uint128_t *memory,
                        tme_uint128_t value_written,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{

  /* when no direct atomic accesses are possible at all, the write
     happens under the write lock.  (the lock itself is skipped when
     threads are cooperative): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrlock(rwlock);
    }
    tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
    if (!tme_thread_cooperative()) {
      tme_rwlock_wrunlock(rwlock);
    }
    return;
  }

  /* otherwise, the host can make some atomic accesses, but not a
     128-bit write (or not at this alignment).  these cases are
     assumed to be rare, so they are handled by stopping all other
     threads for the duration of the write: */
  tme_thread_suspend_others();
  tme_memory_write128((tme_uint128_t *) memory, value_written, align_min);
  tme_thread_resume_others();
}

#endif /* TME_HAVE_INT128_T */
