goc25519sm_amd64.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. // Copyright © 2021 Jeffrey H. Johnson <trnsz@pobox.com>.
  2. // Copyright © 2021 Gridfinity, LLC.
  3. // Copyright © 2021 Filippo Valsorda.
  4. // Copyright © 2012 The Go Authors.
  5. //
  6. // All rights reserved.
  7. //
  8. // Use of this source code is governed by the BSD-style
  9. // license that can be found in the LICENSE file.
  10. //
  11. //go:build amd64 && gc && !purego
  12. // +build amd64,gc,!purego
  13. package goc25519sm
  14. import (
  15. "fmt"
  16. )
  17. // These functions are implemented in the '.s' files
  18. // Naming is analogous to the SUPERCOP implementation
  19. //go:noescape
  20. func cswap(
  21. inout *[5]uint64,
  22. v uint64,
  23. )
  24. //go:noescape
  25. func ladderstep(
  26. inout *[5][5]uint64,
  27. )
  28. //go:noescape
  29. func freeze(
  30. inout *[5]uint64,
  31. )
  32. //go:noescape
  33. func mul(
  34. dest,
  35. a,
  36. b *[5]uint64,
  37. )
  38. //go:noescape
  39. func square(
  40. out,
  41. in *[5]uint64,
  42. )
  43. // mladder implements a Montgomery ladder
  44. // to calculate ( ( 'xr'/'zr' ) *= 's' )
  45. func mladder(
  46. xr,
  47. zr *[5]uint64,
  48. s *[X25519Size]byte,
  49. ) {
  50. var work [5][5]uint64
  51. work[0] = *xr
  52. setint(
  53. &work[1],
  54. 1,
  55. )
  56. setint(
  57. &work[2],
  58. 0,
  59. )
  60. work[3] = *xr
  61. setint(
  62. &work[4],
  63. 1,
  64. )
  65. j := uint(6)
  66. var prevbit byte
  67. for i := 31; i >= 0; i-- {
  68. for j < 8 {
  69. bit := ((*s)[i] >> j) & 1
  70. swap := bit ^ prevbit
  71. prevbit = bit
  72. cswap(
  73. &work[1],
  74. uint64(swap),
  75. )
  76. ladderstep(
  77. &work,
  78. )
  79. j--
  80. }
  81. j = 7
  82. }
  83. *xr = work[1]
  84. *zr = work[2]
  85. }
  86. func oldScalarMult(
  87. dst,
  88. scalar,
  89. base *[X25519Size]byte,
  90. ) error {
  91. var e [X25519Size]byte
  92. var err error
  93. // Dubious to perform clamping at this stage,
  94. // but the behavior matches that of libsodium
  95. copy(
  96. e[:],
  97. (*scalar)[:],
  98. )
  99. e[0] &= 248
  100. e[31] &= 127
  101. e[31] |= 64
  102. var t,
  103. z [5]uint64
  104. unpack(
  105. &t,
  106. base,
  107. )
  108. mladder(
  109. &t,
  110. &z,
  111. &e,
  112. )
  113. invert(
  114. &z,
  115. &z,
  116. )
  117. mul(
  118. &t,
  119. &t,
  120. &z,
  121. )
  122. pack(
  123. dst,
  124. &t,
  125. )
  126. err = oldScalarMultVerify(
  127. dst,
  128. scalar,
  129. base,
  130. )
  131. if err != nil {
  132. return fmt.Errorf(
  133. "\ngoc25519sm.oldScalarMult.OldScalarMult_amd64.oldScalarMultVerify FAILURE:\n\tdst=%v\n\tscalar=%v\n\tbase=%v\n\t%w",
  134. *dst,
  135. *scalar,
  136. *base,
  137. err,
  138. )
  139. }
  140. return nil
  141. }
  142. func setint(
  143. r *[5]uint64,
  144. v uint64,
  145. ) {
  146. r[0] = v
  147. r[1] = 0
  148. r[2] = 0
  149. r[3] = 0
  150. r[4] = 0
  151. }
  152. // unpack sets 'r' = 'x' where 'r' consists of
  153. // five 51-bit limbs (in little-endian order)
  154. func unpack(
  155. r *[5]uint64,
  156. x *[X25519Size]byte,
  157. ) {
  158. r[0] = uint64(x[0]) |
  159. uint64(x[1])<<8 |
  160. uint64(x[2])<<16 |
  161. uint64(x[3])<<24 |
  162. uint64(x[4])<<32 |
  163. uint64(x[5])<<40 |
  164. uint64(x[6]&7)<<48
  165. r[1] = uint64(x[6])>>3 |
  166. uint64(x[7])<<5 |
  167. uint64(x[8])<<13 |
  168. uint64(x[9])<<21 |
  169. uint64(x[10])<<29 |
  170. uint64(x[11])<<37 |
  171. uint64(x[12]&63)<<45
  172. r[2] = uint64(x[12])>>6 |
  173. uint64(x[13])<<2 |
  174. uint64(x[14])<<10 |
  175. uint64(x[15])<<18 |
  176. uint64(x[16])<<26 |
  177. uint64(x[17])<<34 |
  178. uint64(x[18])<<42 |
  179. uint64(x[19]&1)<<50
  180. r[3] = uint64(x[19])>>1 |
  181. uint64(x[20])<<7 |
  182. uint64(x[21])<<15 |
  183. uint64(x[22])<<23 |
  184. uint64(x[23])<<31 |
  185. uint64(x[24])<<39 |
  186. uint64(x[25]&15)<<47
  187. r[4] = uint64(x[25])>>4 |
  188. uint64(x[26])<<4 |
  189. uint64(x[27])<<12 |
  190. uint64(x[28])<<20 |
  191. uint64(x[29])<<28 |
  192. uint64(x[30])<<36 |
  193. uint64(x[31]&127)<<44
  194. }
  195. // pack sets 'out' = 'x' where 'out' is the standard
  196. // little-endian form of the five 51-bit limbs in 'x'
  197. func pack(
  198. out *[X25519Size]byte,
  199. x *[5]uint64,
  200. ) {
  201. t := *x
  202. freeze(
  203. &t,
  204. )
  205. out[0] = byte(t[0])
  206. out[1] = byte(t[0] >> 8)
  207. out[2] = byte(t[0] >> 16)
  208. out[3] = byte(t[0] >> 24)
  209. out[4] = byte(t[0] >> 32)
  210. out[5] = byte(t[0] >> 40)
  211. out[6] = byte(t[0] >> 48)
  212. out[6] ^= byte(t[1]<<3) & 0xf8
  213. out[7] = byte(t[1] >> 5)
  214. out[8] = byte(t[1] >> 13)
  215. out[9] = byte(t[1] >> 21)
  216. out[10] = byte(t[1] >> 29)
  217. out[11] = byte(t[1] >> 37)
  218. out[12] = byte(t[1] >> 45)
  219. out[12] ^= byte(t[2]<<6) & 0xc0
  220. out[13] = byte(t[2] >> 2)
  221. out[14] = byte(t[2] >> 10)
  222. out[15] = byte(t[2] >> 18)
  223. out[16] = byte(t[2] >> 26)
  224. out[17] = byte(t[2] >> 34)
  225. out[18] = byte(t[2] >> 42)
  226. out[19] = byte(t[2] >> 50)
  227. out[19] ^= byte(t[3]<<1) & 0xfe
  228. out[20] = byte(t[3] >> 7)
  229. out[21] = byte(t[3] >> 15)
  230. out[22] = byte(t[3] >> 23)
  231. out[23] = byte(t[3] >> 31)
  232. out[24] = byte(t[3] >> 39)
  233. out[25] = byte(t[3] >> 47)
  234. out[25] ^= byte(t[4]<<4) & 0xf0
  235. out[26] = byte(t[4] >> 4)
  236. out[27] = byte(t[4] >> 12)
  237. out[28] = byte(t[4] >> 20)
  238. out[29] = byte(t[4] >> 28)
  239. out[30] = byte(t[4] >> 36)
  240. out[31] = byte(t[4] >> 44)
  241. }
  242. // invert calculates 'r' = (('x'^-1) mod 'p')
  243. // using Fermat's little theorem
  244. func invert(
  245. r,
  246. x *[5]uint64,
  247. ) {
  248. var z2,
  249. z9,
  250. z11,
  251. z2_5_0,
  252. z2_10_0,
  253. z2_20_0,
  254. z2_50_0,
  255. z2_100_0,
  256. t [5]uint64
  257. square(
  258. &z2,
  259. x,
  260. ) // 2
  261. square(
  262. &t,
  263. &z2,
  264. ) // 4
  265. square(
  266. &t,
  267. &t,
  268. ) // 8
  269. mul(
  270. &z9,
  271. &t,
  272. x,
  273. ) // 9
  274. mul(
  275. &z11,
  276. &z9,
  277. &z2,
  278. ) // 11
  279. square(
  280. &t,
  281. &z11,
  282. ) // 22
  283. mul(
  284. &z2_5_0,
  285. &t,
  286. &z9,
  287. ) // 2^5 - 2^0 = 31
  288. square(
  289. &t,
  290. &z2_5_0,
  291. ) // 2^6 - 2^1
  292. for i := 1; i < 5; i++ {
  293. square(
  294. &t,
  295. &t,
  296. )
  297. } // 2^20 - 2^10
  298. mul(
  299. &z2_10_0,
  300. &t,
  301. &z2_5_0,
  302. ) // 2^10 - 2^0
  303. square(
  304. &t,
  305. &z2_10_0,
  306. ) // 2^11 - 2^1
  307. for i := 1; i < 10; i++ {
  308. square(
  309. &t,
  310. &t,
  311. )
  312. } // 2^20 - 2^10
  313. mul(
  314. &z2_20_0,
  315. &t,
  316. &z2_10_0,
  317. ) // 2^20 - 2^0
  318. square(
  319. &t,
  320. &z2_20_0,
  321. ) // 2^21 - 2^1
  322. for i := 1; i < 20; i++ {
  323. square(
  324. &t,
  325. &t,
  326. )
  327. } // 2^40 - 2^20
  328. mul(
  329. &t,
  330. &t,
  331. &z2_20_0,
  332. ) // 2^40 - 2^0
  333. square(
  334. &t,
  335. &t,
  336. ) // 2^41 - 2^1
  337. for i := 1; i < 10; i++ {
  338. square(
  339. &t,
  340. &t,
  341. )
  342. } // 2^50 - 2^10
  343. mul(
  344. &z2_50_0,
  345. &t,
  346. &z2_10_0,
  347. ) // 2^50 - 2^0
  348. square(
  349. &t,
  350. &z2_50_0,
  351. ) // 2^51 - 2^1
  352. for i := 1; i < 50; i++ {
  353. square(
  354. &t,
  355. &t,
  356. )
  357. } // 2^100 - 2^50
  358. mul(
  359. &z2_100_0,
  360. &t,
  361. &z2_50_0,
  362. ) // 2^100 - 2^0
  363. square(
  364. &t,
  365. &z2_100_0,
  366. ) // 2^101 - 2^1
  367. for i := 1; i < 100; i++ {
  368. square(
  369. &t,
  370. &t,
  371. )
  372. } // 2^200 - 2^100
  373. mul(
  374. &t,
  375. &t,
  376. &z2_100_0,
  377. ) // 2^200 - 2^0
  378. square(
  379. &t,
  380. &t,
  381. ) // 2^201 - 2^1
  382. for i := 1; i < 50; i++ {
  383. square(
  384. &t,
  385. &t,
  386. )
  387. } // 2^250 - 2^50
  388. mul(
  389. &t,
  390. &t,
  391. &z2_50_0,
  392. ) // 2^250 - 2^0
  393. square(
  394. &t,
  395. &t,
  396. ) // 2^251 - 2^1
  397. square(
  398. &t,
  399. &t,
  400. ) // 2^252 - 2^2
  401. square(
  402. &t,
  403. &t,
  404. ) // 2^253 - 2^3
  405. square(
  406. &t,
  407. &t,
  408. ) // 2^254 - 2^4
  409. square(
  410. &t,
  411. &t,
  412. ) // 2^255 - 2^5
  413. mul(
  414. r,
  415. &t,
  416. &z11,
  417. ) // 2^255 - 21
  418. }