/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
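/*
 * For illustration only (not part of the original routine): a minimal
 * C model of the accumulation below, assuming an aligned buffer on a
 * big-endian CPU such as 32-bit PowerPC.  The adde chain is 32-bit
 * one's-complement addition (end-around carry), modeled here with a
 * 64-bit accumulator folded at the end; csum_partial_ref is a
 * hypothetical name.  For a misaligned buffer the asm groups the bytes
 * differently, so the 32-bit partial may differ, but both fold to the
 * same 16-bit checksum (that final fold is done later by csum_fold()).
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static uint32_t csum_partial_ref(const uint8_t *p, size_t len,
 *					 uint32_t sum)
 *	{
 *		uint64_t acc = sum;
 *
 *		while (len >= 4) {		// the lwzu/adde word loops
 *			acc += ((uint32_t)p[0] << 24) | (p[1] << 16) |
 *			       (p[2] << 8) | p[3];
 *			p += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {			// trailing halfword, label 3:
 *			acc += (p[0] << 8) | p[1];
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// trailing byte goes in the
 *			acc += (uint32_t)p[0] << 8;	// upper byte (slwi r0,r0,8)
 *
 *		acc = (acc & 0xffffffff) + (acc >> 32);	// end-around carry,
 *		acc = (acc & 0xffffffff) + (acc >> 32);	// like the final addze
 *		return (uint32_t)acc;
 *	}
 */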
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
EXPORT_SYMBOL(__csum_partial)
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
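/*
 * For illustration only: a rough C model of the contract described
 * above, under the simplifying assumption that no fault occurs (the
 * fault paths have no plain-C equivalent; they are wired up by the
 * __ex_table entries below, which send faulting loads to src_error and
 * faulting stores to dst_error).  csum_copy_ref and csum_partial_ref
 * (sketched earlier) are hypothetical names.
 *
 *	#include <string.h>
 *
 *	static uint32_t csum_copy_ref(const uint8_t *src, uint8_t *dst,
 *				      size_t len, uint32_t sum)
 *	{
 *		memcpy(dst, src, len);			// the stw/stwu stream
 *		return csum_partial_ref(src, len, sum);	// the adde stream
 *	}
 */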
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:				\
	lwz	r7,4(r4);		\
8 ## n ## 1:				\
	lwz	r8,8(r4);		\
8 ## n ## 2:				\
	lwz	r9,12(r4);		\
8 ## n ## 3:				\
	lwzu	r10,16(r4);		\
8 ## n ## 4:				\
	stw	r7,4(r6);		\
	adde	r12,r12,r7;		\
8 ## n ## 5:				\
	stw	r8,8(r6);		\
	adde	r12,r12,r8;		\
8 ## n ## 6:				\
	stw	r9,12(r6);		\
	adde	r12,r12,r9;		\
8 ## n ## 7:				\
	stwu	r10,16(r6);		\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text
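/*
 * For reference, CSUM_COPY_16_BYTES_EXCODE(0) expands (deterministically
 * from the macro above) to the fixup entries for the first 16-byte group:
 * each address that can fault in the loop body is paired with the handler
 * the exception code should resume at.
 *
 *	.section __ex_table,"a"
 *	.align	2
 *	.long	800b,src_error		# the four loads (lwz/lwzu)
 *	.long	801b,src_error
 *	.long	802b,src_error
 *	.long	803b,src_error
 *	.long	804b,dst_error		# the four stores (stw/stwu)
 *	.long	805b,dst_error
 *	.long	806b,dst_error
 *	.long	807b,dst_error
 *	.text
 */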
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
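/* For example, with a 32-byte L1 line (L1_CACHE_SHIFT = 5) these work
   out to CACHELINE_BYTES = 32, LG_CACHELINE_BYTES = 5 and
   CACHELINE_MASK = 0x1f. */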
_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8
	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0	/* is destination address even? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead; a worked example follows
	   the #endif below. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
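/*
 * Worked example (the numbers depend on the CPU's cache parameters, so
 * this is only indicative): with 32-byte lines and MAX_COPY_PREFETCH = 4,
 * a transfer of many cachelines issues four warm-up dcbt touches above,
 * leaving r3 = 4 + 4*32 = 132; the loop at 53: below then keeps
 * prefetching 4 lines (128 bytes) ahead of the line being copied.
 */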
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8
53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr
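/*
 * Typical caller, quoted from memory from this era's
 * arch/powerpc/include/asm/checksum.h and so only indicative: the
 * no-check variant simply passes NULL error pointers, e.g.
 *
 *	#define csum_partial_copy_nocheck(src, dst, len, sum)	\
 *		csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL)
 */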
/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error
EXPORT_SYMBOL(csum_partial_copy_generic)