ip_fast_csum.S 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * Optimized version of the ip_fast_csum() function
  4. * Used for calculating IP header checksum
  5. *
  6. * Return: 16bit checksum, complemented
  7. *
  8. * Inputs:
  9. * in0: address of buffer to checksum (char *)
  10. * in1: length of the buffer in 32-bit words (int)
  11. *
  12. * Copyright (C) 2002, 2006 Intel Corp.
  13. * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
  14. */
  15. #include <asm/asmmacro.h>
  16. #include <asm/export.h>
  17. /*
  18. * Since we know that most likely this function is called with buf aligned
  19. * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
  20. * versus calling generic version of do_csum, which has lots of overhead in
  21. * handling various alignments and sizes. However, due to lack of constraints
  22. * put on the function input argument, cases with alignment not on 4-byte or
  23. * size not equal to 20 bytes will be handled by the generic do_csum function.
  24. */
  /*
   * Symbolic names for the registers used by the routines in this file:
   * in0..in4 stand for r32..r36 and ret0 stands for r8.  ret0 is the
   * register each routine writes its result to just before returning.
   */
  25. #define in0 r32
  26. #define in1 r33
  27. #define in2 r34
  28. #define in3 r35
  29. #define in4 r36
  30. #define ret0 r8
  /*
   * ip_fast_csum: compute the complemented 16-bit ones'-complement sum of a
   * buffer.
   *
   * Inputs:
   *   in0: buffer address
   *   in1: buffer length in 32-bit words (compared against 5 below, and
   *        shifted left by 2 to obtain a byte count for do_csum)
   * Output:
   *   ret0 (r8): complemented checksum
   *
   * Fast path (in1 == 5 and in0 4-byte aligned): the five words are loaded
   * through two interleaved pointers, summed in a 64-bit register, folded to
   * 16 bits and complemented.  Uses r9, r14, r15, r20-r24, p6 and p7 as
   * scratch.
   *
   * Any other case branches to .generic, which sets up a register frame and
   * calls do_csum(in0, in1 * 4), then complements its result.
   */
  31. GLOBAL_ENTRY(ip_fast_csum)
  32. .prologue
  33. .body
  34. cmp.ne p6,p7=5,in1 // p6 = (in1 != 5), p7 = (in1 == 5): size other than 20 byte?
  35. and r14=3,in0 // low two address bits: is it aligned on 4-byte?
  36. add r15=4,in0 // second source pointer (in0 + 4)
  37. ;;
  38. cmp.ne.or.andcm p6,p7=r14,r0 // misaligned: also set p6 and clear p7
  39. ;;
  40. (p7) ld4 r20=[in0],8 // fast path: word at offset 0, in0 advances to offset 8
  41. (p7) ld4 r21=[r15],8 // fast path: word at offset 4, r15 advances to offset 12
  42. (p6) br.spnt .generic // wrong size or alignment: take the slow path
  43. ;;
  44. ld4 r22=[in0],8 // word at offset 8, in0 advances to offset 16
  45. ld4 r23=[r15],8 // word at offset 12
  46. ;;
  47. ld4 r24=[in0] // word at offset 16 (fifth and last word)
  48. add r20=r20,r21 // pairwise partial sums
  49. add r22=r22,r23
  50. ;;
  51. add r20=r20,r22
  52. ;;
  53. add r20=r20,r24 // r20 = sum of all five 32-bit words
  54. ;;
  55. shr.u ret0=r20,16 // now need to add the carry
  56. zxt2 r20=r20 // keep only the low 16 bits
  57. ;;
  58. add r20=ret0,r20 // first fold: high part + low 16 bits
  59. ;;
  60. shr.u ret0=r20,16 // add carry again
  61. zxt2 r20=r20
  62. ;;
  63. add r20=ret0,r20 // second fold
  64. ;;
  65. shr.u ret0=r20,16 // third fold picks up any remaining carry
  66. zxt2 r20=r20
  67. ;;
  68. add r20=ret0,r20
  69. mov r9=0xffff // 16-bit mask for the final complement
  70. ;;
  71. andcm ret0=r9,r20 // ret0 = 0xffff & ~r20: complemented 16-bit sum
  72. .restore sp // reset frame state
  73. br.ret.sptk.many b0
  74. ;;
  /*
   * Slow path: hand the buffer to the generic do_csum routine.  r34 and r35
   * are used here as the two local registers of the newly allocated frame.
   */
  75. .generic:
  76. .prologue
  77. .save ar.pfs, r35
  78. alloc r35=ar.pfs,2,2,2,0 // frame: 2 inputs, 2 locals, 2 outputs; old ar.pfs kept in r35
  79. .save rp, r34
  80. mov r34=b0 // keep the return address across the call
  81. .body
  82. dep.z out1=in1,2,30 // out1 = low 30 bits of in1, shifted left by 2 (words -> bytes)
  83. mov out0=in0 // out0 = buffer address
  84. ;;
  85. br.call.sptk.many b0=do_csum
  86. ;;
  87. andcm ret0=-1,ret0 // ret0 = ~ret0 (all 64 bits); NOTE(review): the fast path masks to 0xffff, this path does not - confirm callers only use the low 16 bits
  88. mov ar.pfs=r35 // restore the caller's ar.pfs
  89. mov b0=r34 // restore the return address
  90. br.ret.sptk.many b0
  91. END(ip_fast_csum)
  92. EXPORT_SYMBOL(ip_fast_csum)
  /*
   * csum_ipv6_magic: complemented 16-bit ones'-complement sum over two
   * 16-byte buffers and three scalar arguments.
   *
   * Inputs (as used by the code below):
   *   in0: pointer, four consecutive 32-bit words are read from it
   *   in1: pointer, four consecutive 32-bit words are read from it
   *   in2: 32-bit value, byte-swapped before being summed
   *   in3: 16-bit value (only the low 16 bits are used), byte-swapped
   *   in4: 32-bit value, summed as is
   *   (presumably source address, destination address, length, protocol and
   *   initial checksum of the IPv6 pseudo-header - confirm against the C
   *   prototype)
   * Output:
   *   r8: complemented checksum
   *
   * in0, in1, in2 and in4 are modified in place; r9-r11 and r15-r27 are used
   * as scratch.
   */
  93. GLOBAL_ENTRY(csum_ipv6_magic)
  94. ld4 r20=[in0],4 // first word of each buffer, pointers post-incremented by 4
  95. ld4 r21=[in1],4
  96. zxt4 in2=in2 // keep only the low 32 bits of in2
  97. ;;
  98. ld4 r22=[in0],4 // second word of each buffer
  99. ld4 r23=[in1],4
  100. dep r15=in3,in2,32,16 // r15 = in2 with the low 16 bits of in3 placed at bits 32..47
  101. ;;
  102. ld4 r24=[in0],4 // third word of each buffer
  103. ld4 r25=[in1],4
  104. mux1 r15=r15,@rev // reverse the byte order of all 8 bytes
  105. add r16=r20,r21 // partial sums of the word pairs
  106. add r17=r22,r23
  107. zxt4 in4=in4 // keep only the low 32 bits of in4
  108. ;;
  109. ld4 r26=[in0],4 // fourth word of each buffer
  110. ld4 r27=[in1],4
  111. shr.u r15=r15,16 // drop the two zero bytes: byte-swapped in3 in bits 0..15, byte-swapped in2 in bits 16..47
  112. add r18=r24,r25
  113. add r8=r16,r17
  114. ;;
  115. add r19=r26,r27
  116. add r8=r8,r18
  117. ;;
  118. add r8=r8,r19 // r8 = sum of all eight buffer words
  119. add r15=r15,in4 // add in4 to the byte-swapped in2/in3 value
  120. ;;
  121. add r8=r8,r15 // r8 = full 64-bit sum
  122. ;;
  123. shr.u r10=r8,32 // now fold sum into short
  124. zxt4 r11=r8 // low 32 bits
  125. ;;
  126. add r8=r10,r11 // 64 -> 32(+carry) bits
  127. ;;
  128. shr.u r10=r8,16 // yeah, keep it rolling
  129. zxt2 r11=r8 // low 16 bits
  130. ;;
  131. add r8=r10,r11 // first 16-bit fold
  132. ;;
  133. shr.u r10=r8,16 // three times lucky
  134. zxt2 r11=r8
  135. ;;
  136. add r8=r10,r11 // NOTE(review): if r8 was exactly 0x1ffff before this fold the result is 0x10000 and the mask below yields 0xffff rather than 0xfffe - confirm whether inputs can reach this
  137. mov r9=0xffff // 16-bit mask for the final complement
  138. ;;
  139. andcm r8=r9,r8 // r8 = 0xffff & ~r8: complemented 16-bit sum
  140. br.ret.sptk.many b0
  141. END(csum_ipv6_magic)
  142. EXPORT_SYMBOL(csum_ipv6_magic)