OK, here it comes. If I am not mistaken, it's faster than the old
routine on anything but 1-byte transfers, with 2-byte transfers
having about the same cost. But I am still a little in doubt about the
costs of different types of branches.
/*
* linux/arch/alpha/lib/memcpy.c
*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 1996 Martin Ostermann
*/
/*
* This is a reasonably optimized memcpy() routine.
*/
/*
* Note that the C code is written to be optimized into good assembly. However,
* at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
* explicit compare against 0 (instead of just using the proper "blt reg, xx" or
* "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
*/
#include <linux/types.h>
/*
* We're not allowed to read more than specified by [src..src+count). Likewise, we may
* only write within [dst..dst+count), since adjacent quadwords may belong to a different
* page that may not be accessible to us. This is what makes this function complicated.
*/
/*
 * Copy count_org bytes from src_org to dst when the two addresses sit at
 * different byte offsets inside an aligned quadword (their low three bits
 * differ); __memcpy() only calls this in that situation.
 *
 * All memory traffic goes through ldq_u/stq_u, which access the aligned
 * quadword containing the given address. Source quadwords are shifted into
 * destination alignment with extql/extqh, and the partial first and last
 * destination quadwords are merged with the bytes already in memory using
 * mskql/mskqh.
 *
 * dst, src_org: destination and source addresses, passed as integers.
 * count_org:    number of bytes to copy; __memcpy() never passes 0.
 */
static inline void
__memcpy_unaligned( unsigned long dst, unsigned long src_org, long count_org)
{
unsigned long low_word, high_word,last_read,src;
long rm,count,loop;
unsigned long tmp,org,org2,mask;
/* rm: byte offset of dst inside its aligned quadword. */
rm = dst & 7;
/* count: bytes measured from the start of that aligned destination quadword. */
count = count_org + rm;
/* org: current contents of the first destination quadword. */
__asm__("ldq_u %0,%1":"=r" (org):"m" (*(unsigned long *)(dst)));
/*
 * low_word: the quadword holding the first source byte. It is fetched via
 * src_org rather than src, so nothing in front of the source range is read.
 */
__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *)(src_org)));
/*
 * src: the source address that corresponds to the start of the aligned
 * destination quadword; its low three bits are the shift used by extql/extqh.
 */
src = src_org - rm;
if( count > 8 ) {
/* The copy touches more than one destination quadword. */
/* last_read: the address just past the final source byte. */
last_read = src_org+count_org;
__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(src+8)));
/* Combine the two source quadwords into one destination-aligned quadword. */
__asm__("extql %1,%2,%0"
:"=r" (low_word)
:"r" (low_word), "r" ((unsigned long)(src)));
__asm__("extqh %1,%2,%0"
:"=r" (tmp)
:"r" (high_word), "r" ((unsigned long)(src)));
tmp |= low_word;
src += 8;
/*
 * Head merge: take the new data from byte (dst & 7) upwards and keep the
 * original memory contents for the bytes below it.
 */
__asm__("mskqh %1,%2,%0"
:"=r" (tmp)
:"r" (tmp), "r" (dst));
__asm__("mskql %1,%2,%0"
:"=r" (org2)
:"r" (org), "r" (dst));
tmp |= org2;
loop = (count-8) >> 3; /* loop eqv. count>=16 ; count -= 8 */
while (loop) { /* tmp is to be stored completely -- need to read the next word */
/* The previous high quadword becomes the low half of the next output. */
low_word = high_word;
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long*)(src+8)));
loop --;
__asm__("extql %1,%2,%0"
:"=r" (low_word)
:"r" (low_word), "r" ((unsigned long)src));
__asm__("extqh %1,%2,%0"
:"=r" (tmp)
:"r" (high_word), "r" ((unsigned long)src));
src += 8;
tmp |= low_word;
dst += 8;
}
if ( count & 7 ) { /* Store tmp completely, and possibly read one more word. */
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
/*
 * Fetch the quadword addressed by last_read for the tail.
 * NOTE(review): last_read is one past the final source byte; confirm this
 * load can never land in a quadword entirely beyond the source range.
 */
__asm__("ldq_u %0,%1":"=r" (tmp):"m" (*((unsigned long *)(last_read)) ));
dst += 8;
__asm__("extql %1,%2,%0"
:"=r" (low_word)
:"r" (high_word), "r" ((unsigned long)src));
__asm__("extqh %1,%2,%0"
:"=r" (tmp)
:"r" (tmp), "r" ((unsigned long)src));
tmp |= low_word;
/*
 * Tail merge: new data below byte (count & 7), original memory contents
 * from that byte upwards.
 */
__asm__("ldq_u %0,%1":"=r" (org):"m" (*(unsigned long *)(dst)));
__asm__("mskql %1,%2,%0"
:"=r" (tmp)
:"r" (tmp), "r" (count));
__asm__("mskqh %1,%2,%0"
:"=r" (org)
:"r" (org), "r" (count));
tmp |= org;
}
/* Store the last (possibly merged) quadword. */
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
return;
}
else { /* count <= 8: everything lands in a single destination quadword */
/*
 * NOTE(review): this reads the quadword at src+8 even though count <= 8
 * here means src+8 is at or past the end of the source bytes; confirm
 * this cannot touch an inaccessible page.
 */
__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(src+8)));
__asm__("extql %1,%2,%0"
:"=r" (low_word)
:"r" (low_word), "r" ((unsigned long)(src)));
__asm__("extqh %1,%2,%0"
:"=r" (tmp)
:"r" (high_word), "r" ((unsigned long)(src)));
tmp |= low_word;
if( count < 8 ) {
/* only works if always count < 8! */
/*
 * Build a byte mask selecting bytes (dst & 7) up to, but not including,
 * byte count; new data goes there and the original bytes stay elsewhere.
 */
mask = -1;
__asm__("mskqh %1,%2,%0"
:"=r" (mask)
:"r" (mask), "r" (dst));
__asm__("mskql %1,%2,%0"
:"=r" (mask)
:"r" (mask), "r" (count));
tmp = (tmp & mask) | (org & ~mask);
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
return;
}
else {
/*
 * count == 8: the copy runs to the end of the quadword, so only the
 * bytes below (dst & 7) have to be preserved.
 */
__asm__("mskqh %1,%2,%0"
:"=r" (tmp)
:"r" (tmp), "r" (dst));
__asm__("mskql %1,%2,%0"
:"=r" (org2)
:"r" (org), "r" (dst));
tmp |= org2;
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
return;
}
}
}
/*
 * Copy count bytes from src to dst when both addresses share the same byte
 * offset inside an aligned quadword (their low three bits are equal);
 * __memcpy() only calls this in that situation.
 *
 * Whole quadwords are moved with ldq_u/stq_u without any shifting; only the
 * partial first and last destination quadwords are merged with the bytes
 * already in memory using mskql/mskqh.
 *
 * dst, src: destination and source addresses, passed as integers.
 * count:    number of bytes to copy; __memcpy() never passes 0.
 */
static inline void
__memcpy_aligned(unsigned long dst, unsigned long src, long count)
{
unsigned long tmp,org,rm,loop;
/* rm: byte offset of dst (and, per the caller's check, of src) in its quadword. */
rm = dst & 7;
/* From here on count is measured from the start of the aligned quadword. */
count = count + rm;
/* tmp: the first source quadword. */
__asm__("ldq_u %0,%1":"=r" (tmp):"m" (*(unsigned long*)(src)));
#if 0 /* optimize for aligned quadword moves */
/* Disabled fast path: no partial quadword at either end, so no merging. */
if( !(rm || count & 7)) {
loop = (count-8) >> 3;
while (loop){ /* while( count >= 8 )*/
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
dst += 8;
src += 8;
__asm__("ldq_u %0,%1":"=r" (tmp):"m" (*(unsigned long*)(src)));
loop--; /* count -= 8 */
}
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
return;
}
#endif
/*
 * Head merge: take the new data from byte (dst & 7) upwards and keep the
 * original memory contents for the bytes below it.
 */
__asm__("ldq_u %0,%1":"=r" (org):"m" (*(unsigned long *)(dst)));
__asm__("mskqh %1,%2,%0"
:"=r" (tmp)
:"r" (tmp), "r" (dst));
__asm__("mskql %1,%2,%0"
:"=r" (org)
:"r" (org), "r" (dst));
tmp |= org;
/* loop: number of quadwords stored in the loop; the last one is stored after it. */
loop = (count-1) >> 3;
while (loop){ /* while( count > 8 )*/
/* Store the pending quadword, then fetch the next source quadword. */
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
dst += 8;
src += 8;
__asm__("ldq_u %0,%1":"=r" (tmp):"m" (*(unsigned long*)(src)));
loop--; /* count -= 8 */
}
if ( count & 7) {
/*
 * Tail merge: the copy ends inside this quadword, so keep the new data
 * below byte (count & 7) and the original memory contents from there up.
 */
__asm__("ldq_u %0,%1":"=r" (org):"m" (*(unsigned long *)(dst)));
__asm__("mskql %1,%2,%0"
:"=r" (tmp)
:"r" (tmp), "r" (count));
__asm__("mskqh %1,%2,%0"
:"=r" (org)
:"r" (org), "r" (count));
tmp |= org;
}
/* Store the last (possibly merged) quadword. */
__asm__("stq_u %1,%0":"=m" (*(unsigned long*)(dst)):"r" (tmp));
}
/*
 * Copy n bytes from src to dest and return dest.
 *
 * A zero-length request returns immediately. Otherwise the work is handed
 * to __memcpy_aligned() when both pointers have the same offset within a
 * quadword (identical low three address bits), and to
 * __memcpy_unaligned() when they do not.
 */
void *
__memcpy(void * dest, const void *src, size_t n)
{
	unsigned long to = (unsigned long) dest;
	unsigned long from = (unsigned long) src;

	if (n != 0) {
		if (((to ^ from) & 7) == 0)
			__memcpy_aligned(to, from, n);
		else
			__memcpy_unaligned(to, from, n);
	}
	return dest;
}
#ifdef __KERNEL__
/*
 * The broken compiler emits calls to "bcopy" for its internal
 * assignments (a silly OSF/1 BSDism), so provide one on top of
 * __memcpy(). Note the argument order: source first, then destination.
 * Returns dest.
 */
char *
bcopy(const char * src, char * dest, size_t n)
{
	char *const target = dest;

	__memcpy(target, src, n);
	return target;
}
/*
 * gcc-2.7.1 and newer generate calls to memset and memcpy, so a memcpy
 * symbol has to exist. The directive below asks the assembler to emit
 * memcpy as a weak external symbol equated to __memcpy.
 * NOTE(review): memset is mentioned here but is not defined in this
 * file -- presumably it is provided elsewhere; confirm.
 */
asm (".weakext memcpy, __memcpy");
#endif
-- Martin Ostermann | mailto:ost@comnets.rwth-aachen.de Communication Networks | http://www.comnets.rwth-aachen.de/~ost Aachen University of Technology | phoneto:++49/241/807917 Germany | faxto:++49/241/8890378
-- To unsubscribe: mail -s unsubscribe axp-list-request@redhat.com < /dev/null
Copyright © 1995-1997 Red Hat Software. Legal notices