checksum_64.S

/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
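/*
 * Rough C-level sketch of the semantics (editor's illustration only, not
 * part of the build; csum_partial_sketch is a hypothetical name).  The
 * asm below accumulates 64-bit doublewords with carry and folds at the
 * end, so its intermediate 32-bit value may differ from this sketch,
 * but the two are equivalent as 1's complement partial sums:
 *
 *	u32 csum_partial_sketch(const u8 *buff, unsigned long len, u32 sum)
 *	{
 *		u64 s = sum;
 *
 *		while (len >= 2) {
 *			s += *(const u16 *)buff;  // native-endian halfword
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)		// trailing byte, zero-padded to a
 *			s += *buff;	// halfword (the asm shifts it <<8 on BE)
 *		while (s >> 32)		// fold the 64-bit sum down to 32 bits
 *			s = (u32)s + (s >> 32);
 *		return (u32)s;
 *	}
 */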
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
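	/* r6 = number of halfwords to consume before r3 is doubleword aligned */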
	mtctr	r6
1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
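	/* ctr = len/64 - 1: the last 64-byte block is summed by the exit limb below */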
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)
	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15

	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)
	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64
	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word
	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b
	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword
	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte
	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish
	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
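/*
 * The macros below attach an exception table entry to the load or store
 * written immediately after them, so a fault on that access branches to
 * the matching error label.  The "nr" variants are used by code running
 * without the stack frame set up for the unrolled loop, so their
 * handlers skip the register restore.
 */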
	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
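/*
 * Illustrative caller-side sketch (editor's example, not from this file;
 * exact C prototype types are assumed).  Either error pointer may be
 * NULL, in which case that class of fault is simply not reported:
 *
 *	int src_err = 0, dst_err = 0;
 *	u32 csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err) {
 *		// a fault occurred mid-copy: the checksum and part of dst
 *		// are unreliable, so the caller must recover as described
 *		// above (zero the destination, redo the copy, or fail).
 *	}
 */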
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6
1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)
	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64
	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)
	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)
	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)
	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word
	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b
	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword
srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte
srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish
srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
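/*
 * Fault handlers.  The plain entries are reached from the unrolled loop
 * and first restore the non-volatile registers and stack frame it set
 * up; the _nr entries are reached from code that never created that
 * frame.  A NULL error pointer means the fault is not reported.
 */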
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)