OSDN Git Service

2013.10.24
[uclinux-h8/uClinux-dist.git] / freeswan / libcrypto / libaes / asm / aes-i586.S
1 //
2 // Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.
3 // All rights reserved.
4 //
5 // TERMS
6 //
7 //  Redistribution and use in source and binary forms, with or without
8 //  modification, are permitted subject to the following conditions:
9 //
10 //  1. Redistributions of source code must retain the above copyright
11 //     notice, this list of conditions and the following disclaimer.
12 //
13 //  2. Redistributions in binary form must reproduce the above copyright
14 //     notice, this list of conditions and the following disclaimer in the
15 //     documentation and/or other materials provided with the distribution.
16 //
17 //  3. The copyright holder's name must not be used to endorse or promote
18 //     any products derived from this software without his specific prior
19 //     written permission.
20 //
21 //  This software is provided 'as is' with no express or implied warranties
22 //  of correctness or fitness for purpose.
23
24 // Modified by Jari Ruusu,  December 24 2001
25 //  - Converted syntax to GNU CPP/assembler syntax
26 //  - C programming interface converted back to "old" API
27 //  - Minor portability cleanups and speed optimizations
28
29 // An AES (Rijndael) implementation for the Pentium. This version only
30 // implements the standard AES block length (128 bits, 16 bytes). This code
31 // does not preserve the eax, ecx or edx registers or the arithmetic status
32 // flags. However, the ebx, esi, edi, and ebp registers are preserved across
33 // calls.
34
35 // void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f)
36 // void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
37 // void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
38
// Build-time configuration.
//  - USE_UNDERLINE: when defined, the three exported entry points are
//    renamed with a leading underscore (presumably for targets whose C
//    symbols carry one -- confirm against the build system).
//  - ALIGN32BYTES: operand handed to every .align directive in this file;
//    it defaults to 32 unless the build has already defined it.
39 #if defined(USE_UNDERLINE)
40 # define aes_set_key _aes_set_key
41 # define aes_encrypt _aes_encrypt
42 # define aes_decrypt _aes_decrypt
43 #endif
44 #if !defined(ALIGN32BYTES)
45 # define ALIGN32BYTES 32
46 #endif
47
// Export the three entry points (under their possibly renamed symbols).
48         .file   "aes-i586.S"
49         .globl  aes_set_key
50         .globl  aes_encrypt
51         .globl  aes_decrypt
52
53 #define tlen    1024    // byte length of each of the 4 'xor' arrays (256 32-bit words)
54
55 // %esp-relative offsets of the aes_encrypt/aes_decrypt arguments while
// exactly one register has been pushed (return address + one push = 8 bytes)
56
57 #define ctx     8       // AES context structure
58 #define in_blk  12      // input byte array address parameter
59 #define out_blk 16      // output byte array address parameter
60
61 // byte offsets of the fields inside the context structure
62
63 #define nkey    0       // key length in 32-bit words, size 4
64 #define nrnd    4       // number of rounds, size 4
65 #define ekey    8       // encryption key schedule base address, size 256
66 #define dkey    264     // decryption key schedule base address, size 256
67
68 // fwd_rnd(p1,p2): one forward (encryption) round on the four state columns.
69 // It is entered with the previous round's column values in %eax, %ebx, %esi
70 // and %edi and exits with the new column values in the same registers.
//
//   p1  base label of four consecutive lookup tables placed tlen bytes apart
//       (T0 = p1, T1 = p1+tlen, T2 = p1+2*tlen, T3 = p1+3*tlen)
//   p2  byte offset from %ebp of this round's four 32-bit key words k0..k3
//
// Writing xN for byte N of register x (N = 0 is the low byte), the result is
//   %eax' = k0 ^ T0[eax0] ^ T1[ebx1] ^ T2[esi2] ^ T3[edi3]
//   %ebx' = k1 ^ T0[ebx0] ^ T1[esi1] ^ T2[edi2] ^ T3[eax3]
//   %esi' = k2 ^ T0[esi0] ^ T1[edi1] ^ T2[eax2] ^ T3[ebx3]
//   %edi' = k3 ^ T0[edi0] ^ T1[eax1] ^ T2[ebx2] ^ T3[esi3]
//
// Scratch: %ecx, %edx and the flags are clobbered. The incoming %ebx and
// %edi are parked in (%esp) and 4(%esp), so the caller must have reserved
// 8 bytes at the top of the stack before expanding this macro.
71
72 #define fwd_rnd(p1,p2)                   \
73         mov     %ebx,(%esp)             ;\
74         movzbl  %al,%edx                ;\
75         mov     %eax,%ecx               ;\
76         mov     p2(%ebp),%eax           ;\
77         mov     %edi,4(%esp)            ;\
78         mov     p2+12(%ebp),%edi        ;\
79         xor     p1(,%edx,4),%eax        ;\
80         movzbl  %ch,%edx                ;\
81         shr     $16,%ecx                ;\
82         mov     p2+4(%ebp),%ebx         ;\
83         xor     p1+tlen(,%edx,4),%edi   ;\
84         movzbl  %cl,%edx                ;\
85         movzbl  %ch,%ecx                ;\
86         xor     p1+3*tlen(,%ecx,4),%ebx ;\
87         mov     %esi,%ecx               ;\
88         mov     p1+2*tlen(,%edx,4),%esi ;\
89         movzbl  %cl,%edx                ;\
90         xor     p1(,%edx,4),%esi        ;\
91         movzbl  %ch,%edx                ;\
92         shr     $16,%ecx                ;\
93         xor     p1+tlen(,%edx,4),%ebx   ;\
94         movzbl  %cl,%edx                ;\
95         movzbl  %ch,%ecx                ;\
96         xor     p1+2*tlen(,%edx,4),%eax ;\
97         mov     (%esp),%edx             ;\
98         xor     p1+3*tlen(,%ecx,4),%edi ;\
99         movzbl  %dl,%ecx                ;\
100         xor     p2+8(%ebp),%esi         ;\
101         xor     p1(,%ecx,4),%ebx        ;\
102         movzbl  %dh,%ecx                ;\
103         shr     $16,%edx                ;\
104         xor     p1+tlen(,%ecx,4),%eax   ;\
105         movzbl  %dl,%ecx                ;\
106         movzbl  %dh,%edx                ;\
107         xor     p1+2*tlen(,%ecx,4),%edi ;\
108         mov     4(%esp),%ecx            ;\
109         xor     p1+3*tlen(,%edx,4),%esi ;\
110         movzbl  %cl,%edx                ;\
111         xor     p1(,%edx,4),%edi        ;\
112         movzbl  %ch,%edx                ;\
113         shr     $16,%ecx                ;\
114         xor     p1+tlen(,%edx,4),%esi   ;\
115         movzbl  %cl,%edx                ;\
116         movzbl  %ch,%ecx                ;\
117         xor     p1+2*tlen(,%edx,4),%ebx ;\
118         xor     p1+3*tlen(,%ecx,4),%eax
119
120 // inv_rnd(p1,p2): one inverse (decryption) round on the four state columns.
121 // It is entered with the previous round's column values in %eax, %ebx, %esi
122 // and %edi and exits with the new column values in the same registers.
//
//   p1  base label of four consecutive lookup tables placed tlen bytes apart
//       (T0 = p1, T1 = p1+tlen, T2 = p1+2*tlen, T3 = p1+3*tlen)
//   p2  byte offset from %ebp of this round's four 32-bit key words k0..k3
//
// Writing xN for byte N of register x (N = 0 is the low byte), the result is
//   %eax' = k0 ^ T0[eax0] ^ T1[edi1] ^ T2[esi2] ^ T3[ebx3]
//   %ebx' = k1 ^ T0[ebx0] ^ T1[eax1] ^ T2[edi2] ^ T3[esi3]
//   %esi' = k2 ^ T0[esi0] ^ T1[ebx1] ^ T2[eax2] ^ T3[edi3]
//   %edi' = k3 ^ T0[edi0] ^ T1[esi1] ^ T2[ebx2] ^ T3[eax3]
// i.e. the same structure as fwd_rnd with the byte rotation running the
// other way round.
//
// Scratch: %ecx, %edx and the flags are clobbered. The incoming %ebx and
// %edi are parked in (%esp) and 4(%esp), so the caller must have reserved
// 8 bytes at the top of the stack before expanding this macro.
123
124 #define inv_rnd(p1,p2)                   \
125         movzbl  %al,%edx                ;\
126         mov     %ebx,(%esp)             ;\
127         mov     %eax,%ecx               ;\
128         mov     p2(%ebp),%eax           ;\
129         mov     %edi,4(%esp)            ;\
130         mov     p2+4(%ebp),%ebx         ;\
131         xor     p1(,%edx,4),%eax        ;\
132         movzbl  %ch,%edx                ;\
133         shr     $16,%ecx                ;\
134         mov     p2+12(%ebp),%edi        ;\
135         xor     p1+tlen(,%edx,4),%ebx   ;\
136         movzbl  %cl,%edx                ;\
137         movzbl  %ch,%ecx                ;\
138         xor     p1+3*tlen(,%ecx,4),%edi ;\
139         mov     %esi,%ecx               ;\
140         mov     p1+2*tlen(,%edx,4),%esi ;\
141         movzbl  %cl,%edx                ;\
142         xor     p1(,%edx,4),%esi        ;\
143         movzbl  %ch,%edx                ;\
144         shr     $16,%ecx                ;\
145         xor     p1+tlen(,%edx,4),%edi   ;\
146         movzbl  %cl,%edx                ;\
147         movzbl  %ch,%ecx                ;\
148         xor     p1+2*tlen(,%edx,4),%eax ;\
149         mov     (%esp),%edx             ;\
150         xor     p1+3*tlen(,%ecx,4),%ebx ;\
151         movzbl  %dl,%ecx                ;\
152         xor     p2+8(%ebp),%esi         ;\
153         xor     p1(,%ecx,4),%ebx        ;\
154         movzbl  %dh,%ecx                ;\
155         shr     $16,%edx                ;\
156         xor     p1+tlen(,%ecx,4),%esi   ;\
157         movzbl  %dl,%ecx                ;\
158         movzbl  %dh,%edx                ;\
159         xor     p1+2*tlen(,%ecx,4),%edi ;\
160         mov     4(%esp),%ecx            ;\
161         xor     p1+3*tlen(,%edx,4),%eax ;\
162         movzbl  %cl,%edx                ;\
163         xor     p1(,%edx,4),%edi        ;\
164         movzbl  %ch,%edx                ;\
165         shr     $16,%ecx                ;\
166         xor     p1+tlen(,%edx,4),%eax   ;\
167         movzbl  %cl,%edx                ;\
168         movzbl  %ch,%ecx                ;\
169         xor     p1+2*tlen(,%edx,4),%ebx ;\
170         xor     p1+3*tlen(,%ecx,4),%esi
171
172 // AES (Rijndael) Encryption Subroutine
//
// void aes_encrypt(const aes_context *cx, const unsigned char in_blk[],
//                  unsigned char out_blk[])
//
// All three arguments are read from the stack. One 16-byte block is loaded
// from in_blk as four 32-bit words, xored with the first round key at
// cx+ekey, passed through nrnd rounds (the last one using aes_fl_tab instead
// of aes_ft_tab) and the four result words are stored to out_blk.
//
// %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and
// the flags are not preserved. No return value is set up.
173
174         .text
175         .align  ALIGN32BYTES
176 aes_encrypt:
177         push    %ebp
178         mov     ctx(%esp),%ebp          // pointer to context
179         mov     in_blk(%esp),%ecx       // pointer to input block
180         push    %ebx
181         push    %esi
182         push    %edi
183         mov     nrnd(%ebp),%edx         // number of rounds
184         lea     ekey+16(%ebp),%ebp      // key pointer
185
186 // input four columns and xor in first round key
187
188         mov     (%ecx),%eax
189         mov     4(%ecx),%ebx
190         mov     8(%ecx),%esi
191         mov     12(%ecx),%edi
192         xor     -16(%ebp),%eax          // first round key sits just below the key pointer
193         xor     -12(%ebp),%ebx
194         xor     -8(%ebp),%esi
195         xor     -4(%ebp),%edi
196
197         sub     $8,%esp                 // space for register saves on stack
198
// Pick the entry point into the unrolled rounds. The key pointer is advanced
// by 32 bytes for each pair of extra rounds so that the last ten rounds can
// always use the fixed offsets 0..144. Any nrnd other than 10 or 12 falls
// through to the full 14-round path.
199         sub     $10,%edx
200         je      aes_15
201         add     $32,%ebp
202         sub     $2,%edx
203         je      aes_13
204         add     $32,%ebp
205
206         fwd_rnd(aes_ft_tab,-64)         // 14 rounds for 256-bit key
207         fwd_rnd(aes_ft_tab,-48)
208 aes_13: fwd_rnd(aes_ft_tab,-32)         // 12 rounds for 192-bit key
209         fwd_rnd(aes_ft_tab,-16)
210 aes_15: fwd_rnd(aes_ft_tab,0)           // 10 rounds for 128-bit key
211         fwd_rnd(aes_ft_tab,16)
212         fwd_rnd(aes_ft_tab,32)
213         fwd_rnd(aes_ft_tab,48)
214         fwd_rnd(aes_ft_tab,64)
215         fwd_rnd(aes_ft_tab,80)
216         fwd_rnd(aes_ft_tab,96)
217         fwd_rnd(aes_ft_tab,112)
218         fwd_rnd(aes_ft_tab,128)
219         fwd_rnd(aes_fl_tab,144)         // last round uses a different table
220
221 // move final values to the output array.
222
// out_blk was defined for one pushed register; +20 accounts for the three
// further pushes (12 bytes) and the 8 bytes of scratch still on the stack.
223         mov     out_blk+20(%esp),%ebp
224         add     $8,%esp                 // release the scratch area
225         mov     %eax,(%ebp)
226         mov     %ebx,4(%ebp)
227         mov     %esi,8(%ebp)
228         mov     %edi,12(%ebp)
229         pop     %edi
230         pop     %esi
231         pop     %ebx
232         pop     %ebp
233         ret
234
235
236 // AES (Rijndael) Decryption Subroutine
//
// void aes_decrypt(const aes_context *cx, const unsigned char in_blk[],
//                  unsigned char out_blk[])
//
// Mirror image of aes_encrypt: the 16-byte block at in_blk is xored with the
// first round key of the decryption schedule at cx+dkey, passed through nrnd
// inverse rounds (the last one using aes_il_tab instead of aes_it_tab) and
// stored to out_blk. The decryption schedule is only written by aes_set_key
// when it is called with its last argument equal to 0.
//
// %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and
// the flags are not preserved. No return value is set up.
237
238         .align  ALIGN32BYTES
239 aes_decrypt:
240         push    %ebp
241         mov     ctx(%esp),%ebp          // pointer to context
242         mov     in_blk(%esp),%ecx       // pointer to input block
243         push    %ebx
244         push    %esi
245         push    %edi
246         mov     nrnd(%ebp),%edx         // number of rounds
247         lea     dkey+16(%ebp),%ebp      // key pointer
248
249 // input four columns and xor in first round key
250
251         mov     (%ecx),%eax
252         mov     4(%ecx),%ebx
253         mov     8(%ecx),%esi
254         mov     12(%ecx),%edi
255         xor     -16(%ebp),%eax          // first round key sits just below the key pointer
256         xor     -12(%ebp),%ebx
257         xor     -8(%ebp),%esi
258         xor     -4(%ebp),%edi
259
260         sub     $8,%esp                 // space for register saves on stack
261
// Pick the entry point into the unrolled rounds. The key pointer is advanced
// by 32 bytes for each pair of extra rounds so that the last ten rounds can
// always use the fixed offsets 0..144. Any nrnd other than 10 or 12 falls
// through to the full 14-round path.
262         sub     $10,%edx
263         je      aes_25
264         add     $32,%ebp
265         sub     $2,%edx
266         je      aes_23
267         add     $32,%ebp
268
269         inv_rnd(aes_it_tab,-64)         // 14 rounds for 256-bit key
270         inv_rnd(aes_it_tab,-48)
271 aes_23: inv_rnd(aes_it_tab,-32)         // 12 rounds for 192-bit key
272         inv_rnd(aes_it_tab,-16)
273 aes_25: inv_rnd(aes_it_tab,0)           // 10 rounds for 128-bit key
274         inv_rnd(aes_it_tab,16)
275         inv_rnd(aes_it_tab,32)
276         inv_rnd(aes_it_tab,48)
277         inv_rnd(aes_it_tab,64)
278         inv_rnd(aes_it_tab,80)
279         inv_rnd(aes_it_tab,96)
280         inv_rnd(aes_it_tab,112)
281         inv_rnd(aes_it_tab,128)
282         inv_rnd(aes_il_tab,144)         // last round uses a different table
283
284 // move final values to the output array.
285
// out_blk was defined for one pushed register; +20 accounts for the three
// further pushes (12 bytes) and the 8 bytes of scratch still on the stack.
286         mov     out_blk+20(%esp),%ebp
287         add     $8,%esp                 // release the scratch area
288         mov     %eax,(%ebp)
289         mov     %ebx,4(%ebp)
290         mov     %esi,8(%ebp)
291         mov     %edi,12(%ebp)
292         pop     %edi
293         pop     %esi
294         pop     %ebx
295         pop     %ebp
296         ret
297
298 // AES (Rijndael) Key Schedule Subroutine
299
300 // input/output parameters: %ebp-relative offsets inside aes_set_key, whose
// prologue does pushfl and push %ebp before copying %esp to %ebp
// (return address + saved flags + saved %ebp = 12 bytes below the first argument)
301
302 #define aes_cx  12      // AES context
303 #define in_key  16      // key input array address
304 #define key_ln  20      // key length, bytes (16,24,32) or bits (128,192,256)
305 #define ed_flg  24      // 0=create both encr/decr keys, 1=create encr key only
306
307 // offsets for locals (%ebp-relative, inside the slen bytes that
// aes_set_key reserves below the saved %ebp)
308
309 #define cnt     -4      // round-key counter for the decryption schedule loop
310 #define kpf     -8      // not referenced anywhere in the visible source
311 #define slen    8       // total size in bytes of the local area
312
313 // This macro performs a column mixing operation on an input 32-bit
314 // word to give a 32-bit result. It uses each of the 4 bytes in the
315 // input column to index 4 different tables of 256 32-bit words
316 // that are xored together to form the output value.
//
//   in:   %ebx = input word; p1 = base label of four tables tlen bytes apart
//   out:  %eax = p1[b0] ^ (p1+tlen)[b1] ^ (p1+2*tlen)[b2] ^ (p1+3*tlen)[b3]
//         where bN is byte N of the input word (N = 0 is the low byte)
//   side effects: %ebx is left rotated right by 16 bits; %ecx and the flags
//         are clobbered
//
// The result depends entirely on the table passed in: with aes_fl_tab each
// byte is replaced by its S-box value in place, with aes_im_tab the word is
// combined through the v0..v3 multiples.
317
318 #define mix_col(p1)                      \
319         movzbl  %bl,%ecx                ;\
320         mov     p1(,%ecx,4),%eax        ;\
321         movzbl  %bh,%ecx                ;\
322         ror     $16,%ebx                ;\
323         xor     p1+tlen(,%ecx,4),%eax   ;\
324         movzbl  %bl,%ecx                ;\
325         xor     p1+2*tlen(,%ecx,4),%eax ;\
326         movzbl  %bh,%ecx                ;\
327         xor     p1+3*tlen(,%ecx,4),%eax
328
329 // Key Schedule Macros
//
// ksc4/ksc6/ksc8(p1) each append one key-length-sized group of words (4, 6
// or 8 words) to the encryption key schedule; p1 is the group index, which
// selects both the round constant (entry p1 of aes_rcon_tab) and the store
// offset from %edi.
//
// Register contract, as set up by aes_set_key:
//   %edi  points just past the user key copied into the schedule
//   %esi, %ebp, %edx, %ebx  hold the last four words generated so far,
//         oldest first; on exit they hold the last four words just stored
//   %eax, %ecx and the flags are clobbered
//
// Common opening: rol $24 rotates %ebx right by one byte, mix_col(aes_fl_tab)
// substitutes each byte through the S-box (leaving %ebx rotated by a further
// 16 bits), ror $8 completes a full turn so %ebx is back to its entry value,
// and the round constant is xored into %eax.
330
// ksc4: the four running registers are the whole previous group; each new
// word is the previous new word xored into the next register.
331 #define ksc4(p1)                         \
332         rol     $24,%ebx                ;\
333         mix_col(aes_fl_tab)             ;\
334         ror     $8,%ebx                 ;\
335         xor     4*p1+aes_rcon_tab,%eax  ;\
336         xor     %eax,%esi               ;\
337         xor     %esi,%ebp               ;\
338         mov     %esi,16*p1(%edi)        ;\
339         mov     %ebp,16*p1+4(%edi)      ;\
340         xor     %ebp,%edx               ;\
341         xor     %edx,%ebx               ;\
342         mov     %edx,16*p1+8(%edi)      ;\
343         mov     %ebx,16*p1+12(%edi)
344
// ksc6: the first two words of the previous group are not held in registers,
// so they are read back from the schedule at -24 and -20 relative to the
// slot being written; the remaining four words chain through the registers.
345 #define ksc6(p1)                         \
346         rol     $24,%ebx                ;\
347         mix_col(aes_fl_tab)             ;\
348         ror     $8,%ebx                 ;\
349         xor     4*p1+aes_rcon_tab,%eax  ;\
350         xor     24*p1-24(%edi),%eax     ;\
351         mov     %eax,24*p1(%edi)        ;\
352         xor     24*p1-20(%edi),%eax     ;\
353         mov     %eax,24*p1+4(%edi)      ;\
354         xor     %eax,%esi               ;\
355         xor     %esi,%ebp               ;\
356         mov     %esi,24*p1+8(%edi)      ;\
357         mov     %ebp,24*p1+12(%edi)     ;\
358         xor     %ebp,%edx               ;\
359         xor     %edx,%ebx               ;\
360         mov     %edx,24*p1+16(%edi)     ;\
361         mov     %ebx,24*p1+20(%edi)
362
// ksc8: the first four words of the previous group are read back from the
// schedule (-32 .. -20). The fourth new word is then passed through
// mix_col(aes_fl_tab) a second time, without rotation or round constant,
// before the last four words chain through the registers. %ebx is kept on
// the stack across that second substitution, so this macro temporarily uses
// 4 bytes of stack.
363 #define ksc8(p1)                         \
364         rol     $24,%ebx                ;\
365         mix_col(aes_fl_tab)             ;\
366         ror     $8,%ebx                 ;\
367         xor     4*p1+aes_rcon_tab,%eax  ;\
368         xor     32*p1-32(%edi),%eax     ;\
369         mov     %eax,32*p1(%edi)        ;\
370         xor     32*p1-28(%edi),%eax     ;\
371         mov     %eax,32*p1+4(%edi)      ;\
372         xor     32*p1-24(%edi),%eax     ;\
373         mov     %eax,32*p1+8(%edi)      ;\
374         xor     32*p1-20(%edi),%eax     ;\
375         mov     %eax,32*p1+12(%edi)     ;\
376         push    %ebx                    ;\
377         mov     %eax,%ebx               ;\
378         mix_col(aes_fl_tab)             ;\
379         pop     %ebx                    ;\
380         xor     %eax,%esi               ;\
381         xor     %esi,%ebp               ;\
382         mov     %esi,32*p1+16(%edi)     ;\
383         mov     %ebp,32*p1+20(%edi)     ;\
384         xor     %ebp,%edx               ;\
385         xor     %edx,%ebx               ;\
386         mov     %edx,32*p1+24(%edi)     ;\
387         mov     %ebx,32*p1+28(%edi)
388
// void aes_set_key(aes_context *cx, const unsigned char key[],
//                  const int key_len, const int f)
//
// Fills in the context at cx:
//   nkey  key length in 32-bit words (4, 6 or 8)
//   nrnd  number of rounds (nkey + 6)
//   ekey  encryption key schedule, expanded from key[]
//   dkey  decryption key schedule, written only when f == 0
//
// key_len may be given in bytes or in bits: values of 128 or more are
// divided by 8. Any resulting length other than 24 or 32 bytes is silently
// treated as 16 bytes; no error is reported.
//
// The caller's flags (including the direction flag cleared by cld) and
// %ebp, %ebx, %esi, %edi are saved and restored; %eax, %ecx and %edx are
// not preserved. No return value is set up.
389         .align  ALIGN32BYTES
390 aes_set_key:
391         pushfl                          // saved so popfl can undo the cld below
392         push    %ebp
393         mov     %esp,%ebp               // frame pointer for the argument/local offsets
394         sub     $slen,%esp              // reserve the local area (cnt, kpf)
395         push    %ebx
396         push    %esi
397         push    %edi
398
399         mov     aes_cx(%ebp),%edx       // edx -> AES context
400
401         mov     key_ln(%ebp),%ecx       // key length
402         cmpl    $128,%ecx               // 128 or more: treat as a bit count
403         jb      aes_30
404         shr     $3,%ecx                 // bits -> bytes
405 aes_30: cmpl    $32,%ecx
406         je      aes_32
407         cmpl    $24,%ecx
408         je      aes_32
409         mov     $16,%ecx                // anything but 24 or 32 bytes becomes 16
410 aes_32: shr     $2,%ecx                 // bytes -> 32-bit words
411         mov     %ecx,nkey(%edx)
412
413         lea     6(%ecx),%eax            // 10/12/14 for 4/6/8 32-bit key length
414         mov     %eax,nrnd(%edx)
415
416         mov     in_key(%ebp),%esi       // key input array
417         lea     ekey(%edx),%edi         // key position in AES context
418         cld
419         push    %ebp                    // %ebp is used as a data register by the ksc macros
420         mov     %ecx,%eax               // save key length in eax
421         rep ;   movsl                   // words in the key schedule
422         mov     -4(%esi),%ebx           // put some values in registers
423         mov     -8(%esi),%edx           // to allow faster code
424         mov     -12(%esi),%ebp          // (the last four words of the user key,
425         mov     -16(%esi),%esi          //  oldest in %esi, newest in %ebx)
426
427         cmpl    $4,%eax                 // jump on key size
428         je      aes_36
429         cmpl    $6,%eax
430         je      aes_35
431
// 8-word key: 7 groups of 8 words
432         ksc8(0)
433         ksc8(1)
434         ksc8(2)
435         ksc8(3)
436         ksc8(4)
437         ksc8(5)
438         ksc8(6)
439         jmp     aes_37
// 6-word key: 8 groups of 6 words
440 aes_35: ksc6(0)
441         ksc6(1)
442         ksc6(2)
443         ksc6(3)
444         ksc6(4)
445         ksc6(5)
446         ksc6(6)
447         ksc6(7)
448         jmp     aes_37
// 4-word key: 10 groups of 4 words
449 aes_36: ksc4(0)
450         ksc4(1)
451         ksc4(2)
452         ksc4(3)
453         ksc4(4)
454         ksc4(5)
455         ksc4(6)
456         ksc4(7)
457         ksc4(8)
458         ksc4(9)
459 aes_37: pop     %ebp                    // frame pointer back
460         mov     aes_cx(%ebp),%edx       // edx -> AES context
461         cmpl    $0,ed_flg(%ebp)         // non-zero flag: encryption schedule only
462         jne     aes_39
463
464 // compile decryption key schedule from encryption schedule - reverse
465 // order and do mix_column operation on round keys except first and last
466
467         mov     nrnd(%edx),%eax         // kt = cx->d_key + nc * cx->Nrnd
468         shl     $2,%eax
469         lea     dkey(%edx,%eax,4),%edi  // edi -> last 16-byte slot of the decryption schedule
470         lea     ekey(%edx),%esi         // kf = cx->e_key
471
472         movsl                           // copy first round key (unmodified)
473         movsl
474         movsl
475         movsl
476         sub     $32,%edi                // step the destination back to the preceding slot
477         movl    $1,cnt(%ebp)            // loop below handles round keys 1 .. nrnd-1
478 aes_38:                                 // do mix column on each column of
479         lodsl                           // each round key
480         mov     %eax,%ebx
481         mix_col(aes_im_tab)
482         stosl
483         lodsl
484         mov     %eax,%ebx
485         mix_col(aes_im_tab)
486         stosl
487         lodsl
488         mov     %eax,%ebx
489         mix_col(aes_im_tab)
490         stosl
491         lodsl
492         mov     %eax,%ebx
493         mix_col(aes_im_tab)
494         stosl
495         sub     $32,%edi                // source walks forwards, destination backwards
496
497         incl    cnt(%ebp)
498         mov     cnt(%ebp),%eax
499         cmp     nrnd(%edx),%eax
500         jb      aes_38
501
502         movsl                           // copy last round key (unmodified)
503         movsl
504         movsl
505         movsl
506 aes_39: pop     %edi
507         pop     %esi
508         pop     %ebx
509         mov     %ebp,%esp               // drop the local area
510         pop     %ebp
511         popfl                           // restore the caller's flags
512         ret
513
514
515 // finite field multiplies by {02}, {04} and {08}
//
// Each macro shifts the byte left and then, for every bit that was shifted
// past bit 7, xors in the matching shifted copy of 0x11b (the polynomial
// x^8+x^4+x^3+x+1), so the result fits in 8 bits again. The arguments are
// not parenthesised inside the bodies; every use in this file passes a
// plain hexadecimal constant.
516
517 #define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
518 #define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
519 #define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
520
521 // finite field multiplies required in table generation
// ({03}, {09}, {0b}, {0d} and {0e}, built by xoring the products above)
522
523 #define f3(x)   (f2(x) ^ x)
524 #define f9(x)   (f8(x) ^ x)
525 #define fb(x)   (f8(x) ^ f2(x) ^ x)
526 #define fd(x)   (f8(x) ^ f4(x) ^ x)
527 #define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
528
529 // These defines generate the forward table entries
// (one 32-bit word per byte x; u0 packs {03}x, x, x, {02}x from the most to
// the least significant byte, and u1..u3 are u0 rotated left by 8, 16 and
// 24 bits)
530
531 #define u0(x)   ((f3(x) << 24) | (x << 16) | (x << 8) | f2(x))
532 #define u1(x)   ((x << 24) | (x << 16) | (f2(x) << 8) | f3(x))
533 #define u2(x)   ((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x)
534 #define u3(x)   ((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x)
535
536 // These defines generate the inverse table entries
// (v0 packs {0b}x, {0d}x, {09}x, {0e}x from the most to the least
// significant byte, and v1..v3 are v0 rotated left by 8, 16 and 24 bits)
537
538 #define v0(x)   ((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x))
539 #define v1(x)   ((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x))
540 #define v2(x)   ((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x))
541 #define v3(x)   ((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x))
542
543 // These defines generate entries for the last round tables
// (the byte itself, moved to byte position 0, 1, 2 or 3 of the word)
544
545 #define w0(x)   (x)
546 #define w1(x)   (x <<  8)
547 #define w2(x)   (x << 16)
548 #define w3(x)   (x << 24)
549
550 // macro to generate inverse mix column tables (needed for the key schedule)
//
// im_dataN(p1) emits 32 .long values: p1 applied to the consecutive byte
// values 32*N .. 32*N+31. Expanding im_data0 .. im_data7 in order therefore
// yields one 256-entry table indexed directly by the byte value.
551
552 #define im_data0(p1) \
553         .long   p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\
554         .long   p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\
555         .long   p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\
556         .long   p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f)
557 #define im_data1(p1) \
558         .long   p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\
559         .long   p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\
560         .long   p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\
561         .long   p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f)
562 #define im_data2(p1) \
563         .long   p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\
564         .long   p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\
565         .long   p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\
566         .long   p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f)
567 #define im_data3(p1) \
568         .long   p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\
569         .long   p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\
570         .long   p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\
571         .long   p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f)
572 #define im_data4(p1) \
573         .long   p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\
574         .long   p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\
575         .long   p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\
576         .long   p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f)
577 #define im_data5(p1) \
578         .long   p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\
579         .long   p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\
580         .long   p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\
581         .long   p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf)
582 #define im_data6(p1) \
583         .long   p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\
584         .long   p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\
585         .long   p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\
586         .long   p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf)
587 #define im_data7(p1) \
588         .long   p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\
589         .long   p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\
590         .long   p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\
591         .long   p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff)
592
593 // S-box data - 256 entries
//
// sb_dataN(p1) emits 32 .long values: p1 applied to S-box entries
// 32*N .. 32*N+31. Expanding sb_data0 .. sb_data7 in order yields one
// 256-entry table whose entry i is p1(sbox[i]).
594
595 #define sb_data0(p1) \
596         .long   p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\
597         .long   p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\
598         .long   p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\
599         .long   p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0)
600 #define sb_data1(p1) \
601         .long   p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\
602         .long   p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\
603         .long   p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\
604         .long   p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75)
605 #define sb_data2(p1) \
606         .long   p1(0x09),p1(0x83),p1(0x2c),p1(0x1a),p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\
607         .long   p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3),p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\
608         .long   p1(0x53),p1(0xd1),p1(0x00),p1(0xed),p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\
609         .long   p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39),p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf)
610 #define sb_data3(p1) \
611         .long   p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb),p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\
612         .long   p1(0x45),p1(0xf9),p1(0x02),p1(0x7f),p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\
613         .long   p1(0x51),p1(0xa3),p1(0x40),p1(0x8f),p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\
614         .long   p1(0xbc),p1(0xb6),p1(0xda),p1(0x21),p1(0x10),p1(0xff),p1(0xf3),p1(0xd2)
615 #define sb_data4(p1) \
616         .long   p1(0xcd),p1(0x0c),p1(0x13),p1(0xec),p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\
617         .long   p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d),p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\
618         .long   p1(0x60),p1(0x81),p1(0x4f),p1(0xdc),p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\
619         .long   p1(0x46),p1(0xee),p1(0xb8),p1(0x14),p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb)
620 #define sb_data5(p1) \
621         .long   p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a),p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\
622         .long   p1(0xc2),p1(0xd3),p1(0xac),p1(0x62),p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\
623         .long   p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d),p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\
624         .long   p1(0x6c),p1(0x56),p1(0xf4),p1(0xea),p1(0x65),p1(0x7a),p1(0xae),p1(0x08)
625 #define sb_data6(p1) \
626         .long   p1(0xba),p1(0x78),p1(0x25),p1(0x2e),p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\
627         .long   p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f),p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\
628         .long   p1(0x70),p1(0x3e),p1(0xb5),p1(0x66),p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\
629         .long   p1(0x61),p1(0x35),p1(0x57),p1(0xb9),p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e)
630 #define sb_data7(p1) \
631         .long   p1(0xe1),p1(0xf8),p1(0x98),p1(0x11),p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\
632         .long   p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9),p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\
633         .long   p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d),p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\
634         .long   p1(0x41),p1(0x99),p1(0x2d),p1(0x0f),p1(0xb0),p1(0x54),p1(0xbb),p1(0x16)
635
636 // Inverse S-box data - 256 entries
//
// ib_dataN(p1) emits 32 .long values: p1 applied to inverse S-box entries
// 32*N .. 32*N+31. Expanding ib_data0 .. ib_data7 in order yields one
// 256-entry table whose entry i is p1(inverse_sbox[i]).
637
638 #define ib_data0(p1) \
639         .long   p1(0x52),p1(0x09),p1(0x6a),p1(0xd5),p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\
640         .long   p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e),p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\
641         .long   p1(0x7c),p1(0xe3),p1(0x39),p1(0x82),p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\
642         .long   p1(0x34),p1(0x8e),p1(0x43),p1(0x44),p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb)
643 #define ib_data1(p1) \
644         .long   p1(0x54),p1(0x7b),p1(0x94),p1(0x32),p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\
645         .long   p1(0xee),p1(0x4c),p1(0x95),p1(0x0b),p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\
646         .long   p1(0x08),p1(0x2e),p1(0xa1),p1(0x66),p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\
647         .long   p1(0x76),p1(0x5b),p1(0xa2),p1(0x49),p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25)
648 #define ib_data2(p1) \
649         .long   p1(0x72),p1(0xf8),p1(0xf6),p1(0x64),p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\
650         .long   p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc),p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\
651         .long   p1(0x6c),p1(0x70),p1(0x48),p1(0x50),p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\
652         .long   p1(0x5e),p1(0x15),p1(0x46),p1(0x57),p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84)
653 #define ib_data3(p1) \
654         .long   p1(0x90),p1(0xd8),p1(0xab),p1(0x00),p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\
655         .long   p1(0xf7),p1(0xe4),p1(0x58),p1(0x05),p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\
656         .long   p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f),p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\
657         .long   p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03),p1(0x01),p1(0x13),p1(0x8a),p1(0x6b)
658 #define ib_data4(p1) \
659         .long   p1(0x3a),p1(0x91),p1(0x11),p1(0x41),p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\
660         .long   p1(0x97),p1(0xf2),p1(0xcf),p1(0xce),p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\
661         .long   p1(0x96),p1(0xac),p1(0x74),p1(0x22),p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\
662         .long   p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8),p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e)
663 #define ib_data5(p1) \
664         .long   p1(0x47),p1(0xf1),p1(0x1a),p1(0x71),p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\
665         .long   p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e),p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\
666         .long   p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b),p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\
667         .long   p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe),p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4)
668 #define ib_data6(p1) \
669         .long   p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33),p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\
670         .long   p1(0xb1),p1(0x12),p1(0x10),p1(0x59),p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\
671         .long   p1(0x60),p1(0x51),p1(0x7f),p1(0xa9),p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\
672         .long   p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f),p1(0x93),p1(0xc9),p1(0x9c),p1(0xef)
673 #define ib_data7(p1) \
674         .long   p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d),p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\
675         .long   p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c),p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\
676         .long   p1(0x17),p1(0x2b),p1(0x04),p1(0x7e),p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\
677         .long   p1(0xe1),p1(0x69),p1(0x14),p1(0x63),p1(0x55),p1(0x21),p1(0x0c),p1(0x7d)
678
679 // The rcon_table (needed for the key schedule)
680 //
681 // Here is original Dr Brian Gladman's source code:
682 //      _rcon_tab:
683 //      %assign x   1
684 //      %rep 29
685 //          dd  x
686 //      %assign x f2(x)
687 //      %endrep
688 //
689 // Here is precomputed output (it's more portable this way):
//
// 29 32-bit entries; each one is f2() of the entry before it, starting
// from 1. The ksc4/ksc6/ksc8 macros read entry p1 at byte offset 4*p1, and
// aes_set_key never passes a p1 larger than 9.
690
691         .align  ALIGN32BYTES
692 aes_rcon_tab:
693         .long   0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
694         .long   0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f
695         .long   0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4
696         .long   0xb3,0x7d,0xfa,0xef,0xc5
697
698 // The forward xor tables
//
// aes_ft_tab: four consecutive 256-entry tables of 32-bit words, tlen bytes
// each. Entry i of table n is u<n>(sbox[i]); table n starts at
// aes_ft_tab + n*tlen, which is how fwd_rnd addresses it. Used by
// aes_encrypt for every round except the last.
699
700         .align  ALIGN32BYTES
701 aes_ft_tab:
702         sb_data0(u0)
703         sb_data1(u0)
704         sb_data2(u0)
705         sb_data3(u0)
706         sb_data4(u0)
707         sb_data5(u0)
708         sb_data6(u0)
709         sb_data7(u0)
710
711         sb_data0(u1)
712         sb_data1(u1)
713         sb_data2(u1)
714         sb_data3(u1)
715         sb_data4(u1)
716         sb_data5(u1)
717         sb_data6(u1)
718         sb_data7(u1)
719
720         sb_data0(u2)
721         sb_data1(u2)
722         sb_data2(u2)
723         sb_data3(u2)
724         sb_data4(u2)
725         sb_data5(u2)
726         sb_data6(u2)
727         sb_data7(u2)
728
729         sb_data0(u3)
730         sb_data1(u3)
731         sb_data2(u3)
732         sb_data3(u3)
733         sb_data4(u3)
734         sb_data5(u3)
735         sb_data6(u3)
736         sb_data7(u3)
737
// aes_fl_tab: four consecutive 256-entry tables, tlen bytes each, used for
// the last encryption round and, through mix_col, by the key schedule
// macros. Entry i of table n is w<n>(sbox[i]), i.e. the S-box value placed
// in byte n of an otherwise zero word.
738         .align  ALIGN32BYTES
739 aes_fl_tab:
740         sb_data0(w0)
741         sb_data1(w0)
742         sb_data2(w0)
743         sb_data3(w0)
744         sb_data4(w0)
745         sb_data5(w0)
746         sb_data6(w0)
747         sb_data7(w0)
748
749         sb_data0(w1)
750         sb_data1(w1)
751         sb_data2(w1)
752         sb_data3(w1)
753         sb_data4(w1)
754         sb_data5(w1)
755         sb_data6(w1)
756         sb_data7(w1)
757
758         sb_data0(w2)
759         sb_data1(w2)
760         sb_data2(w2)
761         sb_data3(w2)
762         sb_data4(w2)
763         sb_data5(w2)
764         sb_data6(w2)
765         sb_data7(w2)
766
767         sb_data0(w3)
768         sb_data1(w3)
769         sb_data2(w3)
770         sb_data3(w3)
771         sb_data4(w3)
772         sb_data5(w3)
773         sb_data6(w3)
774         sb_data7(w3)
775
776 // The inverse xor tables
//
// aes_it_tab: four consecutive 256-entry tables of 32-bit words, tlen bytes
// each. Entry i of table n is v<n>(inverse_sbox[i]); table n starts at
// aes_it_tab + n*tlen, which is how inv_rnd addresses it. Used by
// aes_decrypt for every round except the last.
777
778         .align  ALIGN32BYTES
779 aes_it_tab:
780         ib_data0(v0)
781         ib_data1(v0)
782         ib_data2(v0)
783         ib_data3(v0)
784         ib_data4(v0)
785         ib_data5(v0)
786         ib_data6(v0)
787         ib_data7(v0)
788
789         ib_data0(v1)
790         ib_data1(v1)
791         ib_data2(v1)
792         ib_data3(v1)
793         ib_data4(v1)
794         ib_data5(v1)
795         ib_data6(v1)
796         ib_data7(v1)
797
798         ib_data0(v2)
799         ib_data1(v2)
800         ib_data2(v2)
801         ib_data3(v2)
802         ib_data4(v2)
803         ib_data5(v2)
804         ib_data6(v2)
805         ib_data7(v2)
806
807         ib_data0(v3)
808         ib_data1(v3)
809         ib_data2(v3)
810         ib_data3(v3)
811         ib_data4(v3)
812         ib_data5(v3)
813         ib_data6(v3)
814         ib_data7(v3)
815
// aes_il_tab: four consecutive 256-entry tables, tlen bytes each, used for
// the last decryption round. Entry i of table n is w<n>(inverse_sbox[i]),
// i.e. the inverse S-box value placed in byte n of an otherwise zero word.
816         .align  ALIGN32BYTES
817 aes_il_tab:
818         ib_data0(w0)
819         ib_data1(w0)
820         ib_data2(w0)
821         ib_data3(w0)
822         ib_data4(w0)
823         ib_data5(w0)
824         ib_data6(w0)
825         ib_data7(w0)
826
827         ib_data0(w1)
828         ib_data1(w1)
829         ib_data2(w1)
830         ib_data3(w1)
831         ib_data4(w1)
832         ib_data5(w1)
833         ib_data6(w1)
834         ib_data7(w1)
835
836         ib_data0(w2)
837         ib_data1(w2)
838         ib_data2(w2)
839         ib_data3(w2)
840         ib_data4(w2)
841         ib_data5(w2)
842         ib_data6(w2)
843         ib_data7(w2)
844
845         ib_data0(w3)
846         ib_data1(w3)
847         ib_data2(w3)
848         ib_data3(w3)
849         ib_data4(w3)
850         ib_data5(w3)
851         ib_data6(w3)
852         ib_data7(w3)
853
854 // The inverse mix column tables
//
// aes_im_tab: four consecutive 256-entry tables of 32-bit words, tlen bytes
// each. Entry i of table n is v<n>(i), indexed by the raw byte value with
// no S-box step. aes_set_key passes this table to mix_col when it derives
// the decryption key schedule from the encryption one.
855
856         .align  ALIGN32BYTES
857 aes_im_tab:
858         im_data0(v0)
859         im_data1(v0)
860         im_data2(v0)
861         im_data3(v0)
862         im_data4(v0)
863         im_data5(v0)
864         im_data6(v0)
865         im_data7(v0)
866
867         im_data0(v1)
868         im_data1(v1)
869         im_data2(v1)
870         im_data3(v1)
871         im_data4(v1)
872         im_data5(v1)
873         im_data6(v1)
874         im_data7(v1)
875
876         im_data0(v2)
877         im_data1(v2)
878         im_data2(v2)
879         im_data3(v2)
880         im_data4(v2)
881         im_data5(v2)
882         im_data6(v2)
883         im_data7(v2)
884
885         im_data0(v3)
886         im_data1(v3)
887         im_data2(v3)
888         im_data3(v3)
889         im_data4(v3)
890         im_data5(v3)
891         im_data6(v3)
892         im_data7(v3)