arm / sha1_arm.S on commit diff-delta.c: Rationalize culling of hash buckets (02e665c)
   1/*
   2 *  SHA transform optimized for ARM
   3 *
   4 *  Copyright:  (C) 2005 by Nicolas Pitre <nico@cam.org>
   5 *  Created:    September 17, 2005
   6 *
   7 *  This program is free software; you can redistribute it and/or modify
   8 *  it under the terms of the GNU General Public License version 2 as
   9 *  published by the Free Software Foundation.
  10 */
  11
  12        .text
  13        .globl  sha_transform
  14
  15/*
  16 * void sha_transform(uint32_t *hash, const unsigned char *data, uint32_t *W);
  17 *
  18 * note: the "data" pointer may be unaligned.
  19 */
            /*
             * This first part fills the W work area: W[0..15] come from the
             * 64 bytes at "data" and W[16..79] are derived from them, so W
             * must have room for 80 words.
             *
             * Register use below:
             *   r0 = hash (first argument); not consumed in this part
             *   r1 = data (second argument)
             *   r2 = W (third argument); addresses W[0] when this part ends
             *   r3 = moving pointer into W
             *   r4 - r7 = scratch
             *   lr = loop counter (its entry value is saved on the stack)
             */
  20
  21sha_transform:
  22
            /* Push r4-r8 and lr: all of them are overwritten further down. */
  23        stmfd   sp!, {r4 - r8, lr}
  24
  25        @ for (i = 0; i < 16; i++)
  26        @         W[i] = ntohl(((uint32_t *)data)[i]);
  27
  28#ifdef __ARMEB__
            /*
             * Big-endian build: no byte swapping is needed, so this is just
             * memcpy(W, data, 64).  hash is parked in r4 across the call and
             * memcpy's return value (its destination, i.e. W) goes back to r2.
             */
  29        mov     r4, r0
  30        mov     r0, r2
  31        mov     r2, #64
  32        bl      memcpy
  33        mov     r2, r0
  34        mov     r0, r4
  35#else
            /*
             * Little-endian build: assemble each word from four single-byte
             * loads, most significant byte first.  Only byte accesses touch
             * "data", which is what makes an unaligned pointer acceptable.
             */
  36        mov     r3, r2                  /* r3 = &W[0]; r2 itself is left alone */
  37        mov     lr, #16                 /* 16 words to build */
  381:      ldrb    r4, [r1], #1            /* r4 = b0 (ends up in bits 31..24) */
  39        ldrb    r5, [r1], #1            /* r5 = b1 */
  40        ldrb    r6, [r1], #1            /* r6 = b2 */
  41        ldrb    r7, [r1], #1            /* r7 = b3 */
  42        subs    lr, lr, #1              /* flags for the bne below; orr/str do not change them */
  43        orr     r5, r5, r4, lsl #8      /* r5 = b0<<8 | b1 */
  44        orr     r6, r6, r5, lsl #8      /* r6 = b0<<16 | b1<<8 | b2 */
  45        orr     r7, r7, r6, lsl #8      /* r7 = b0<<24 | b1<<16 | b2<<8 | b3 */
  46        str     r7, [r3], #4            /* W[i] = r7, then step to the next word */
  47        bne     1b
  48#endif
  49
  50        @ for (i = 0; i < 64; i++)
  51        @         W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31);
  52
            /*
             * r3 starts one word before W so that the pre-indexed load with
             * writeback at label 2 leaves r3 = &W[i] on iteration i (0..63).
             * Byte offsets 8, 32, 52 and 64 from r3 are then W[i+2], W[i+8],
             * W[i+13] and W[i+16].
             */
  53        sub     r3, r2, #4
  54        mov     lr, #64
  552:      ldr     r4, [r3, #4]!           /* r3 += 4, then r4 = W[i] */
  56        subs    lr, lr, #1              /* nothing below sets flags, so bne sees this result */
  57        ldr     r5, [r3, #8]            /* W[i+2] */
  58        ldr     r6, [r3, #32]           /* W[i+8] */
  59        ldr     r7, [r3, #52]           /* W[i+13] */
  60        eor     r4, r4, r5
  61        eor     r4, r4, r6
  62        eor     r4, r4, r7
  63        mov     r4, r4, ror #31         /* rotate right by 31 == rotate left by 1 */
  64        str     r4, [r3, #64]           /* W[i+16] */
  65        bne     2b
  66
  67        /*
  68         * The SHA functions are:
  69         *
  70         * f1(B,C,D) = (D ^ (B & (C ^ D)))
  71         * f2(B,C,D) = (B ^ C ^ D)
  72         * f3(B,C,D) = ((B & C) | (D & (B | C)))
  73         *
  74         * Then the sub-blocks are processed as follows:
  75         *
  76         * A' = ror(A, 27) + f(B,C,D) + E + K + *W++
  77         * B' = A
  78         * C' = ror(B, 2)
  79         * D' = C
  80         * E' = D
  81         *
  82         * We therefore unroll each loop 5 times to avoid register shuffling.
  83         * Also the ror for C (and also D and E which are successively derived
  84         * from it) is applied in place to cut on an additional mov insn for
  85         * each round.
  86         */
  87
  88        .macro  sha_f1, A, B, C, D, E
            /*
             * One round using f1(B,C,D) = D ^ (B & (C ^ D)).
             *
             * The five arguments name the registers currently playing the
             * roles A..E.  A and B are used as stored; C, D and E are only
             * ever consumed through "ror #2" (for C ^ D the rotation is
             * applied after the xor).  The new A is accumulated into the E
             * register: r1 (K) + ror(E,2) + ror(A,27) + next W word + f1.
             * The A, B, C and D registers are not written.
             *
             * Side effects: r2 advances by one word; r3 and ip are
             * overwritten.  No instruction here sets the condition flags.
             */
  89        ldr     r3, [r2], #4            /* r3 = *W++ */
  90        eor     ip, \C, \D              /* C ^ D on the stored values; rotated on next use */
  91        add     \E, r1, \E, ror #2      /* E = K + ror(E, 2) */
  92        and     ip, \B, ip, ror #2      /* B & ror(C ^ D, 2) */
  93        add     \E, \E, \A, ror #27     /* + ror(A, 27), i.e. A rotated left by 5 */
  94        eor     ip, ip, \D, ror #2      /* ip = f1 */
  95        add     \E, \E, r3              /* + W word */
  96        add     \E, \E, ip              /* + f1 */
  97        .endm
  98
  99        .macro  sha_f2, A, B, C, D, E
            /*
             * One round using f2(B,C,D) = B ^ C ^ D.
             *
             * The five arguments name the registers currently playing the
             * roles A..E.  A and B are used as stored; C, D and E are read
             * through "ror #2".  The new A is accumulated into the E
             * register: r1 (K) + ror(E,2) + ror(A,27) + next W word + f2.
             * The A, B, C and D registers are not written.
             *
             * Side effects: r2 advances by one word; r3 and ip are
             * overwritten.  No instruction here sets the condition flags.
             */
 100        ldr     r3, [r2], #4            /* r3 = *W++ */
 101        add     \E, r1, \E, ror #2      /* E = K + ror(E, 2) */
 102        eor     ip, \B, \C, ror #2      /* B ^ ror(C, 2) */
 103        add     \E, \E, \A, ror #27     /* + ror(A, 27), i.e. A rotated left by 5 */
 104        eor     ip, ip, \D, ror #2      /* ip = f2 */
 105        add     \E, \E, r3              /* + W word */
 106        add     \E, \E, ip              /* + f2 */
 107        .endm
 108
 109        .macro  sha_f3, A, B, C, D, E
            /*
             * One round using f3(B,C,D) = (B & C) | (D & (B | C)).
             *
             * The five arguments name the registers currently playing the
             * roles A..E.  A and B are used as stored; C, D and E are read
             * through "ror #2".  The new A is accumulated into the E
             * register: r1 (K) + ror(E,2) + ror(A,27) + next W word + f3.
             * The A, B, C and D registers are not written.
             *
             * Side effects: r2 advances by one word; r3 and ip are
             * overwritten (r3 is reused as scratch once the W word has been
             * added).  No instruction here sets the condition flags.
             */
 110        ldr     r3, [r2], #4            /* r3 = *W++ */
 111        add     \E, r1, \E, ror #2      /* E = K + ror(E, 2) */
 112        orr     ip, \B, \C, ror #2      /* B | ror(C, 2) */
 113        add     \E, \E, \A, ror #27     /* + ror(A, 27), i.e. A rotated left by 5 */
 114        and     ip, ip, \D, ror #2      /* ror(D, 2) & (B | ror(C, 2)) */
 115        add     \E, \E, r3              /* + W word; r3 is free from here on */
 116        and     r3, \B, \C, ror #2      /* r3 = B & ror(C, 2) */
 117        orr     ip, ip, r3              /* ip = f3 */
 118        add     \E, \E, ip              /* + f3 */
 119        .endm
 120
 121        ldmia   r0, {r4 - r8}           /* r4..r8 = hash[0..4], used as A..E */
            /*
             * 80 rounds in four groups of 20.  Each group loads its constant
             * into r1 and loops 4 times over 5 macro expansions whose register
             * arguments are rotated by one position each time, so after every
             * pass r4..r8 are back in the roles A..E.
             *
             * r2 is expected to address W[0] here; every macro expansion
             * advances it by one word.  lr is the pass counter: "subs" sits at
             * the top of each loop and the matching "bne" at the bottom, which
             * works because the round macros never set the condition flags.
             */
 122
 123        mov     lr, #4
 124        ldr     r1, .L_sha_K + 0        /* K for rounds 0..19 */
 125
 126        /* adjust initial values: pre-rotate C, D, E left by 2 (ror #30) because
            the round macros always read these roles through "ror #2" */
 127        mov     r6, r6, ror #30
 128        mov     r7, r7, ror #30
 129        mov     r8, r8, ror #30
 130
 1313:      subs    lr, lr, #1
 132        sha_f1  r4, r5, r6, r7, r8
 133        sha_f1  r8, r4, r5, r6, r7
 134        sha_f1  r7, r8, r4, r5, r6
 135        sha_f1  r6, r7, r8, r4, r5
 136        sha_f1  r5, r6, r7, r8, r4
 137        bne     3b
 138
 139        ldr     r1, .L_sha_K + 4        /* K for rounds 20..39 */
 140        mov     lr, #4
 141
 1424:      subs    lr, lr, #1
 143        sha_f2  r4, r5, r6, r7, r8
 144        sha_f2  r8, r4, r5, r6, r7
 145        sha_f2  r7, r8, r4, r5, r6
 146        sha_f2  r6, r7, r8, r4, r5
 147        sha_f2  r5, r6, r7, r8, r4
 148        bne     4b
 149
 150        ldr     r1, .L_sha_K + 8        /* K for rounds 40..59 */
 151        mov     lr, #4
 152
 1535:      subs    lr, lr, #1
 154        sha_f3  r4, r5, r6, r7, r8
 155        sha_f3  r8, r4, r5, r6, r7
 156        sha_f3  r7, r8, r4, r5, r6
 157        sha_f3  r6, r7, r8, r4, r5
 158        sha_f3  r5, r6, r7, r8, r4
 159        bne     5b
 160
 161        ldr     r1, .L_sha_K + 12       /* K for rounds 60..79; f2 is used again */
 162        mov     lr, #4
 163
 1646:      subs    lr, lr, #1
 165        sha_f2  r4, r5, r6, r7, r8
 166        sha_f2  r8, r4, r5, r6, r7
 167        sha_f2  r7, r8, r4, r5, r6
 168        sha_f2  r6, r7, r8, r4, r5
 169        sha_f2  r5, r6, r7, r8, r4
 170        bne     6b
 171
            /*
             * Add the working values to the previous hash words and store the
             * result back through r0.  r4 (A) and r5 (B) are added as they
             * are; r6..r8 (C, D, E) get the pending "ror #2" applied as part
             * of the add.
             */
 172        ldmia   r0, {r1, r2, r3, ip, lr}    /* reload hash[0..4] */
 173        add     r4, r1, r4
 174        add     r5, r2, r5
 175        add     r6, r3, r6, ror #2
 176        add     r7, ip, r7, ror #2
 177        add     r8, lr, r8, ror #2
 178        stmia   r0, {r4 - r8}
 179
 180        ldmfd   sp!, {r4 - r8, pc}      /* restore r4-r8; popping the saved lr into pc returns */
 181
 182.L_sha_K:
            /*
             * The four round constants, one per group of 20 rounds (rounds
             * 0-19, 20-39, 40-59, 60-79).  The round code fetches them with
             * "ldr r1, .L_sha_K + <byte offset>" using offsets 0, 4, 8 and 12.
             */
 183        .word   0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6