checksum_64.S

/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
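/*
 * For reference, a hedged C sketch of the semantics implemented here
 * (the function name is illustrative, not part of this file): 16-bit
 * chunks are accumulated into a wide sum and the carries folded back
 * in, giving the IP 1's complement sum. The assembly below does the
 * same eight bytes at a time, letting the carry flag chain through
 * adde/addze.
 *
 *	unsigned int csum_sketch(const unsigned char *buff,
 *				 unsigned long len, unsigned int sum)
 *	{
 *		unsigned long s = sum;
 *		for (; len >= 2; buff += 2, len -= 2)
 *			s += *(const unsigned short *)buff; // assumes 2-byte alignment
 *		if (len)
 *			s += (unsigned long)*buff << 8;	// trailing odd byte
 *		while (s >> 32)				// fold carries back in
 *			s = (s & 0xffffffffUL) + (s >> 32);
 *		return (unsigned int)s;
 *	}
 */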
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word
	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
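	/*
	 * The alignment arithmetic below, as a C sketch: r6 is the
	 * buffer's halfword offset within a doubleword, and (4 - r6)
	 * halfwords bring it up to an 8-byte boundary (the odd-address
	 * case is ignored, per the comment above).
	 *
	 *	r6 = (r3 >> 1) & 0x3;
	 *	if (r6)
	 *		ctr = 4 - r6;	// halfwords consumed by the loop at 1:
	 */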
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned
	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
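	/*
	 * Iteration-count arithmetic, sketched in C (len >= 128 once we
	 * get past the branch below):
	 *
	 *	ctr = (len >> 6) - 1;	// 64-byte iterations, minus one
	 *
	 * The minus one is because the first 32 bytes are loaded before
	 * the loop is entered and a final 64-byte limb is summed after
	 * it falls through.
	 */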
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)
	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)
	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64
	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)
	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)
	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64
	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word
	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword
	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte
	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish
	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
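	/*
	 * A hedged C sketch of the tail cascade above: after the
	 * doubleword loop, the remaining length (now < 8) is decomposed
	 * bit by bit.
	 *
	 *	if (len & 4) { s += *(unsigned int *)p; p += 4; }
	 *	if (len & 2) { s += *(unsigned short *)p; p += 2; }
	 *	if (len & 1) s += (unsigned long)*p << 8;
	 */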
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
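/*
 * The fold above, sketched in C: rotate the 64-bit sum by 32 bits,
 * add, and keep the high word, which is (high + low + carry).
 *
 *	unsigned long t = ((s << 32) | (s >> 32)) + s;	// rotl64(s,32) + s
 *	return (unsigned int)(t >> 32);
 */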
EXPORT_SYMBOL(__csum_partial)

	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm
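/*
 * Each macro above tags the instruction that follows it with a local
 * label and records an (instruction, fixup) address pair in the
 * __ex_table section. Conceptually (a sketch; as the .llong pairs
 * show, this 64-bit layout stores two absolute addresses per entry):
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// address that may fault
 *		unsigned long fixup;	// where to resume on a fault
 *	};
 *
 * On an access fault the exception code looks up the faulting address
 * and continues at the fixup (.Lsrc_error* / .Ldest_error* below).
 */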
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any
 * action required in this case (zeroing memory, recalculating the
 * partial checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
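/*
 * A hedged sketch of how a caller might use the error arguments
 * (variable names are illustrative):
 *
 *	int src_err = 0, dst_err = 0;
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err)
 *		recover();	// -EFAULT was stored; csum is not valid
 *
 * Passing a NULL error pointer suppresses the store; see the cmpdi
 * checks in the error paths at the end of this file.
 */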
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word
	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned
	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)
	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64
	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)
	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)
	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)
	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64
	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)
	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)
	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64
	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)
	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)
	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)
	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word
	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword
srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte
srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish
srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
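/*
 * Exception fixups. The "source"/"dest" macro variants are used in
 * the unrolled loop, where r14-r16 have been saved and a stack frame
 * is open, so their handlers restore that state first. The "_nr"
 * (no-restore) variants are used where no frame was allocated.
 */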
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)