OSDN Git Service

- fixup asm. No object-code changes
[uclinux-h8/uClibc.git] / libc / string / cris / memcpy.c
1 /* Copyright (C) 2001, 2003 Free Software Foundation, Inc.
2    Copyright (C) 1994, 1995, 2000 Axis Communications AB.
3
4    This file is part of the GNU C Library.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public License as
8    published by the Free Software Foundation; either version 2 of the
9    License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public
17    License along with the GNU C Library; see the file COPYING.LIB.  If not,
18    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19    Boston, MA 02111-1307, USA.  */
20
21 /*#************************************************************************#*/
22 /*#-------------------------------------------------------------------------*/
23 /*#                                                                         */
24 /*# FUNCTION NAME: memcpy()                                                 */
25 /*#                                                                         */
26 /*# PARAMETERS:  void* dst;   Destination address.                          */
27 /*#              void* src;   Source address.                               */
28 /*#              int   len;   Number of bytes to copy.                      */
29 /*#                                                                         */
30 /*# RETURNS:     dst.                                                       */
31 /*#                                                                         */
32 /*# DESCRIPTION: Copies len bytes of memory from src to dst.  No guarantees */
33 /*#              about copying of overlapping memory areas. This routine is */
34 /*#              very sensitive to compiler changes in register allocation. */
35 /*#              Should really be rewritten to avoid this problem.          */
36 /*#                                                                         */
37 /*#-------------------------------------------------------------------------*/
38 /*#                                                                         */
39 /*# HISTORY                                                                 */
40 /*#                                                                         */
41 /*# DATE      NAME            CHANGES                                       */
42 /*# ----      ----            -------                                       */
43 /*# 941007    Kenny R         Creation                                      */
44 /*# 941011    Kenny R         Lots of optimizations and inlining.           */
45 /*# 941129    Ulf A           Adapted for use in libc.                      */
46 /*# 950216    HP              N==0 forgotten if non-aligned src/dst.        */
47 /*#                           Added some optimizations.                     */
48 /*# 001025    HP              Make src and dst char *.  Align dst to        */
49 /*#                           dword, not just word-if-both-src-and-dst-     */
50 /*#                           are-misaligned.                               */
51 /*# 070806    RW              Modified for uClibc                           */
52 /*#                           (__arch_v32 -> __CONFIG_CRISV32__,            */
53 /*#                           include features.h to reach it.)              */
54 /*#                                                                         */
55 /*#-------------------------------------------------------------------------*/
56
57 #include <features.h>
58
/* Smallest byte count for which memcpy switches to its 'movem' block-copy
   path.  */
#if !defined(__CONFIG_CRISV32__)
/* Break even between movem and move16 is at 38.7*2, but modulo 44. */
#define MEMCPY_BLOCK_THRESHOLD (2 * 44)
#else
/* On CRISv32 movem is very cheap, so a single 44-byte block is enough.  */
#define MEMCPY_BLOCK_THRESHOLD (44)
#endif
66
67 void *memcpy(void *, const void *, unsigned int);
68
69 libc_hidden_proto(memcpy)
70 void *memcpy(void *pdst,
71              const void *psrc,
72              unsigned int pn)
73 {
74   /* Ok.  Now we want the parameters put in special registers.
75      Make sure the compiler is able to make something useful of this.
76       As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
77
78      If gcc was allright, it really would need no temporaries, and no
79      stack space to save stuff on. */
80
81 #ifndef MEMPCPY
82   register void *return_dst __asm__ ("r10") = pdst;
83 #else
84   /* FIXME: Use R10 for something.  */
85 # define return_dst dst
86 #endif
87
88   register char *dst __asm__ ("r13") = pdst;
89   register char *src __asm__ ("r11") = (char *) psrc;
90   register int n __asm__ ("r12") = pn;
91   
92  
93   /* When src is aligned but not dst, this makes a few extra needless
94      cycles.  I believe it would take as many to check that the
95      re-alignment was unnecessary.  */
96   if (((unsigned long) dst & 3) != 0
97       /* Don't align if we wouldn't copy more than a few bytes; so we
98          don't have to check further for overflows.  */
99       && n >= 3)
100   {
101     if ((unsigned long) dst & 1)
102     {
103       n--;
104       *(char*)dst = *(char*)src;
105       src++;
106       dst++;
107     }
108
109     if ((unsigned long) dst & 2)
110     {
111       n -= 2;
112       *(short*)dst = *(short*)src;
113       src += 2;
114       dst += 2;
115     }
116   }
117
118   /* Decide which copying method to use. */
119   if (n >= MEMCPY_BLOCK_THRESHOLD)
120   {
121     /* For large copies we use 'movem' */
122
123   /* It is not optimal to tell the compiler about clobbering any
124      registers; that will move the saving/restoring of those registers
125      to the function prologue/epilogue, and make non-movem sizes
126      suboptimal.
127
128       This method is not foolproof; it assumes that the "register asm"
129      declarations at the beginning of the function really are used
130      here (beware: they may be moved to temporary registers).
131       This way, we do not have to save/move the registers around into
132      temporaries; we can safely use them straight away.  */
133     __asm__ __volatile__ ("\
134         .syntax no_register_prefix                                      \n\
135                                                                         \n\
136         ;; Check that the register asm declaration got right.           \n\
137         ;; The GCC manual explicitly says TRT will happen.              \n\
138         .ifnc %0-%1-%2,$r13-$r11-$r12                                   \n\
139         .err                                                            \n\
140         .endif                                                          \n\
141                                                                         \n\
142         ;; Save the registers we'll use in the movem process            \n\
143         ;; on the stack.                                                \n\
144         subq    11*4,sp                                                 \n\
145         movem   r10,[sp]                                                \n\
146                                                                         \n\
147         ;; Now we've got this:                                          \n\
148         ;; r11 - src                                                    \n\
149         ;; r13 - dst                                                    \n\
150         ;; r12 - n                                                      \n\
151                                                                         \n\
152         ;; Update n for the first loop                                  \n\
153         subq    44,r12                                                  \n\
154 0:                                                                      \n\
155         movem   [r11+],r10                                              \n\
156         subq   44,r12                                                   \n\
157         bge     0b                                                      \n\
158         movem   r10,[r13+]                                              \n\
159                                                                         \n\
160         addq   44,r12  ;; compensate for last loop underflowing n       \n\
161                                                                         \n\
162         ;; Restore registers from stack                                 \n\
163         movem [sp+],r10"
164
165      /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) 
166      /* Inputs */ : "0" (dst), "1" (src), "2" (n));
167   }
168
169   /* Either we directly starts copying, using dword copying
170      in a loop, or we copy as much as possible with 'movem' 
171      and then the last block (<44 bytes) is copied here.
172      This will work since 'movem' will have updated src,dst,n. */
173
174   while ( n >= 16 )
175   {
176     *((long*)dst)++ = *((long*)src)++;
177     *((long*)dst)++ = *((long*)src)++;
178     *((long*)dst)++ = *((long*)src)++;
179     *((long*)dst)++ = *((long*)src)++;
180     n -= 16;
181   }
182
183   /* A switch() is definitely the fastest although it takes a LOT of code.
184    * Particularly if you inline code this.
185    */
186   switch (n)
187   {
188     case 0:
189       break;
190     case 1:
191       *((char*)dst)++ = *((char*)src)++;
192       break;
193     case 2:
194       *((short*)dst)++ = *((short*)src)++;
195       break;
196     case 3:
197       *((short*)dst)++ = *((short*)src)++;
198       *((char*)dst)++ = *((char*)src)++;
199       break;
200     case 4:
201       *((long*)dst)++ = *((long*)src)++;
202       break;
203     case 5:
204       *((long*)dst)++ = *((long*)src)++;
205       *((char*)dst)++ = *((char*)src)++;
206       break;
207     case 6:
208       *((long*)dst)++ = *((long*)src)++;
209       *((short*)dst)++ = *((short*)src)++;
210       break;
211     case 7:
212       *((long*)dst)++ = *((long*)src)++;
213       *((short*)dst)++ = *((short*)src)++;
214       *((char*)dst)++ = *((char*)src)++;
215       break;
216     case 8:
217       *((long*)dst)++ = *((long*)src)++;
218       *((long*)dst)++ = *((long*)src)++;
219       break;
220     case 9:
221       *((long*)dst)++ = *((long*)src)++;
222       *((long*)dst)++ = *((long*)src)++;
223       *((char*)dst)++ = *((char*)src)++;
224       break;
225     case 10:
226       *((long*)dst)++ = *((long*)src)++;
227       *((long*)dst)++ = *((long*)src)++;
228       *((short*)dst)++ = *((short*)src)++;
229       break;
230     case 11:
231       *((long*)dst)++ = *((long*)src)++;
232       *((long*)dst)++ = *((long*)src)++;
233       *((short*)dst)++ = *((short*)src)++;
234       *((char*)dst)++ = *((char*)src)++;
235       break;
236     case 12:
237       *((long*)dst)++ = *((long*)src)++;
238       *((long*)dst)++ = *((long*)src)++;
239       *((long*)dst)++ = *((long*)src)++;
240       break;
241     case 13:
242       *((long*)dst)++ = *((long*)src)++;
243       *((long*)dst)++ = *((long*)src)++;
244       *((long*)dst)++ = *((long*)src)++;
245       *((char*)dst)++ = *((char*)src)++;
246       break;
247     case 14:
248       *((long*)dst)++ = *((long*)src)++;
249       *((long*)dst)++ = *((long*)src)++;
250       *((long*)dst)++ = *((long*)src)++;
251       *((short*)dst)++ = *((short*)src)++;
252       break;
253     case 15:
254       *((long*)dst)++ = *((long*)src)++;
255       *((long*)dst)++ = *((long*)src)++;
256       *((long*)dst)++ = *((long*)src)++;
257       *((short*)dst)++ = *((short*)src)++;
258       *((char*)dst)++ = *((char*)src)++;
259       break;
260   }
261
262   return return_dst; /* destination pointer. */
263 } /* memcpy() */
264 libc_hidden_def(memcpy)