goc25519sm_generic.go 22 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042
  1. // Copyright © 2021 Jeffrey H. Johnson <trnsz@pobox.com>.
  2. // Copyright © 2021 Gridfinity, LLC.
  3. // Copyright © 2013 The Go Authors.
  4. // All rights reserved.
  5. //
  6. // Use of this source code is governed by the BSD-style
  7. // license that can be found in the LICENSE file.
  8. package goc25519sm // import "github.com/johnsonjh/goc25519sm"
  9. // This code is a port of the public domain, "ref10" implementation of
  10. // Curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
  11. // fieldElement represents an element of the field GF(2^255 - 19).
  12. // An element t, entries t[0]...t[9], represents the integer
  13. // t[0]+2^26 t[1]+2^51 t[2]+2^77 t[3]+2^102 t[4]+...+2^230 t[9].
  14. // Bounds on each t[i] vary depending on context.
  15. type fieldElement [10]int32
  16. var zero fieldElement
  17. func feZero(
  18. fe *fieldElement,
  19. ) {
  20. copy(
  21. fe[:],
  22. zero[:],
  23. )
  24. }
  25. func feOne(
  26. fe *fieldElement,
  27. ) {
  28. feZero(
  29. fe,
  30. )
  31. fe[0] = 1
  32. }
  33. func feAdd(
  34. dst,
  35. a,
  36. b *fieldElement,
  37. ) {
  38. for i := range dst {
  39. dst[i] = a[i] + b[i]
  40. }
  41. }
  42. func feSub(
  43. dst,
  44. a,
  45. b *fieldElement,
  46. ) {
  47. for i := range dst {
  48. dst[i] = a[i] - b[i]
  49. }
  50. }
  51. func feCopy(
  52. dst,
  53. src *fieldElement,
  54. ) {
  55. copy(
  56. dst[:],
  57. src[:],
  58. )
  59. }
  60. // feCSwap replaces (f,g) with (g,f) if b == 1;
  61. // replaces (f,g) with (f,g) if b == 0.
  62. // Preconditions: b in {0,1}.
  63. func feCSwap(
  64. f,
  65. g *fieldElement,
  66. b int32,
  67. ) {
  68. b = -b
  69. for i := range f {
  70. t := b & (f[i] ^ g[i])
  71. f[i] ^= t
  72. g[i] ^= t
  73. }
  74. }
  75. func load3(
  76. scalar []byte,
  77. ) int64 {
  78. var r int64
  79. r = int64(scalar[0])
  80. r |= int64(scalar[1]) << 8
  81. r |= int64(scalar[2]) << 16
  82. return r
  83. }
  84. func load4(
  85. scalar []byte,
  86. ) int64 {
  87. var r int64
  88. r = int64(scalar[0])
  89. r |= int64(scalar[1]) << 8
  90. r |= int64(scalar[2]) << 16
  91. r |= int64(scalar[3]) << 24
  92. return r
  93. }
  94. func feFromBytes(
  95. dst *fieldElement,
  96. src *[X25519Size]byte,
  97. ) {
  98. h0 := load4(
  99. src[:],
  100. )
  101. h1 := load3(
  102. src[4:],
  103. ) << 6
  104. h2 := load3(
  105. src[7:],
  106. ) << 5
  107. h3 := load3(
  108. src[10:],
  109. ) << 3
  110. h4 := load3(
  111. src[13:],
  112. ) << 2
  113. h5 := load4(
  114. src[16:],
  115. )
  116. h6 := load3(
  117. src[20:],
  118. ) << 7
  119. h7 := load3(
  120. src[23:],
  121. ) << 5
  122. h8 := load3(
  123. src[26:],
  124. ) << 4
  125. h9 := (load3(
  126. src[29:],
  127. ) & 0x7fffff) << 2
  128. var carry [10]int64
  129. carry[9] = (h9 + 1<<24) >> 25
  130. h0 += carry[9] * 19
  131. h9 -= carry[9] << 25
  132. carry[1] = (h1 + 1<<24) >> 25
  133. h2 += carry[1]
  134. h1 -= carry[1] << 25
  135. carry[3] = (h3 + 1<<24) >> 25
  136. h4 += carry[3]
  137. h3 -= carry[3] << 25
  138. carry[5] = (h5 + 1<<24) >> 25
  139. h6 += carry[5]
  140. h5 -= carry[5] << 25
  141. carry[7] = (h7 + 1<<24) >> 25
  142. h8 += carry[7]
  143. h7 -= carry[7] << 25
  144. carry[0] = (h0 + 1<<25) >> 26
  145. h1 += carry[0]
  146. h0 -= carry[0] << 26
  147. carry[2] = (h2 + 1<<25) >> 26
  148. h3 += carry[2]
  149. h2 -= carry[2] << 26
  150. carry[4] = (h4 + 1<<25) >> 26
  151. h5 += carry[4]
  152. h4 -= carry[4] << 26
  153. carry[6] = (h6 + 1<<25) >> 26
  154. h7 += carry[6]
  155. h6 -= carry[6] << 26
  156. carry[8] = (h8 + 1<<25) >> 26
  157. h9 += carry[8]
  158. h8 -= carry[8] << 26
  159. dst[0] = int32(h0)
  160. dst[1] = int32(h1)
  161. dst[2] = int32(h2)
  162. dst[3] = int32(h3)
  163. dst[4] = int32(h4)
  164. dst[5] = int32(h5)
  165. dst[6] = int32(h6)
  166. dst[7] = int32(h7)
  167. dst[8] = int32(h8)
  168. dst[9] = int32(h9)
  169. }
  170. // feToBytes marshals h to s.
  171. // Preconditions:
  172. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24, etc.
  173. // Write p=2^255-19; q=floor(h/p).
  174. // Basic Claim:
  175. // q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
  176. // Proof:
  177. // Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
  178. // Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.
  179. // Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
  180. // Then 0<y<1.
  181. // Write r=h-pq.
  182. // Have 0<=r<=p-1=2^255-20.
  183. // Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
  184. // Write x=r+19(2^-255)r+y.
  185. // Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
  186. // Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
  187. // So floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
  188. func feToBytes(
  189. s *[X25519Size]byte,
  190. h *fieldElement,
  191. ) {
  192. var carry [10]int32
  193. q := (19*h[9] + (1 << 24)) >> 25
  194. q = (h[0] + q) >> 26
  195. q = (h[1] + q) >> 25
  196. q = (h[2] + q) >> 26
  197. q = (h[3] + q) >> 25
  198. q = (h[4] + q) >> 26
  199. q = (h[5] + q) >> 25
  200. q = (h[6] + q) >> 26
  201. q = (h[7] + q) >> 25
  202. q = (h[8] + q) >> 26
  203. q = (h[9] + q) >> 25
  204. // Goal: Output h-(2^255-19)q,
  205. // which is between 0 and 2^255-20.
  206. h[0] += 19 * q
  207. // Goal: Output h-2^255 q,
  208. // which is between 0 and 2^255-20.
  209. carry[0] = h[0] >> 26
  210. h[1] += carry[0]
  211. h[0] -= carry[0] << 26
  212. carry[1] = h[1] >> 25
  213. h[2] += carry[1]
  214. h[1] -= carry[1] << 25
  215. carry[2] = h[2] >> 26
  216. h[3] += carry[2]
  217. h[2] -= carry[2] << 26
  218. carry[3] = h[3] >> 25
  219. h[4] += carry[3]
  220. h[3] -= carry[3] << 25
  221. carry[4] = h[4] >> 26
  222. h[5] += carry[4]
  223. h[4] -= carry[4] << 26
  224. carry[5] = h[5] >> 25
  225. h[6] += carry[5]
  226. h[5] -= carry[5] << 25
  227. carry[6] = h[6] >> 26
  228. h[7] += carry[6]
  229. h[6] -= carry[6] << 26
  230. carry[7] = h[7] >> 25
  231. h[8] += carry[7]
  232. h[7] -= carry[7] << 25
  233. carry[8] = h[8] >> 26
  234. h[9] += carry[8]
  235. h[8] -= carry[8] << 26
  236. carry[9] = h[9] >> 25
  237. h[9] -= carry[9] << 25
  238. // h10 = carry9
  239. // Goal: Output h[0]+...+2^255 h10-2^255 q,
  240. // which is between 0 and 2^255-20.
  241. // Have h[0]+...+2^230 h[9] between 0 and 2^255-1;
  242. // evidently 2^255 h10-2^255 q = 0.
  243. // Goal: Output h[0]+...+2^230 h[9].
  244. s[0] = byte(h[0] >> 0)
  245. s[1] = byte(h[0] >> 8)
  246. s[2] = byte(h[0] >> 16)
  247. s[3] = byte((h[0] >> 24) | (h[1] << 2))
  248. s[4] = byte(h[1] >> 6)
  249. s[5] = byte(h[1] >> 14)
  250. s[6] = byte((h[1] >> 22) | (h[2] << 3))
  251. s[7] = byte(h[2] >> 5)
  252. s[8] = byte(h[2] >> 13)
  253. s[9] = byte((h[2] >> 21) | (h[3] << 5))
  254. s[10] = byte(h[3] >> 3)
  255. s[11] = byte(h[3] >> 11)
  256. s[12] = byte((h[3] >> 19) | (h[4] << 6))
  257. s[13] = byte(h[4] >> 2)
  258. s[14] = byte(h[4] >> 10)
  259. s[15] = byte(h[4] >> 18)
  260. s[16] = byte(h[5] >> 0)
  261. s[17] = byte(h[5] >> 8)
  262. s[18] = byte(h[5] >> 16)
  263. s[19] = byte((h[5] >> 24) | (h[6] << 1))
  264. s[20] = byte(h[6] >> 7)
  265. s[21] = byte(h[6] >> 15)
  266. s[22] = byte((h[6] >> 23) | (h[7] << 3))
  267. s[23] = byte(h[7] >> 5)
  268. s[24] = byte(h[7] >> 13)
  269. s[25] = byte((h[7] >> 21) | (h[8] << 4))
  270. s[26] = byte(h[8] >> 4)
  271. s[27] = byte(h[8] >> 12)
  272. s[28] = byte((h[8] >> 20) | (h[9] << 6))
  273. s[29] = byte(h[9] >> 2)
  274. s[30] = byte(h[9] >> 10)
  275. s[31] = byte(h[9] >> 18)
  276. }
  277. // feMul calculates h = f * g
  278. // Can overlap h with f or g.
  279. // Preconditions:
  280. // |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  281. // |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  282. // Postconditions:
  283. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  284. // Notes on implementation strategy:
  285. // * Using schoolbook multiplication.
  286. // * Karatsuba **would** save a little - in some cost models.
  287. // * Most multiplications by 2 and 19 are 32-bit precomputations;
  288. // these are explictly cheaper than 64-bit postcomputations.
  289. // * There is one remaining multiplication by 19 in the carry chain;
  290. // one *19 precomputation can be merged into this, but the resulting
  291. // data flow is considerably less clean.
  292. // * There are 12 carries below. 10 of them are 2-way parallelizable
  293. // and vectorizable. Can get away with 11 carries, but then data
  294. // flow is much deeper.
  295. // * With tighter constraints on inputs, can squeeze carries into int32.
  296. func feMul(
  297. h,
  298. f,
  299. g *fieldElement,
  300. ) {
  301. f0 := f[0]
  302. f1 := f[1]
  303. f2 := f[2]
  304. f3 := f[3]
  305. f4 := f[4]
  306. f5 := f[5]
  307. f6 := f[6]
  308. f7 := f[7]
  309. f8 := f[8]
  310. f9 := f[9]
  311. g0 := g[0]
  312. g1 := g[1]
  313. g2 := g[2]
  314. g3 := g[3]
  315. g4 := g[4]
  316. g5 := g[5]
  317. g6 := g[6]
  318. g7 := g[7]
  319. g8 := g[8]
  320. g9 := g[9]
  321. g1_19 := 19 * g1
  322. g2_19 := 19 * g2
  323. g3_19 := 19 * g3
  324. g4_19 := 19 * g4
  325. g5_19 := 19 * g5
  326. g6_19 := 19 * g6
  327. g7_19 := 19 * g7
  328. g8_19 := 19 * g8
  329. g9_19 := 19 * g9
  330. f1_2 := 2 * f1
  331. f3_2 := 2 * f3
  332. f5_2 := 2 * f5
  333. f7_2 := 2 * f7
  334. f9_2 := 2 * f9
  335. f0g0 := int64(f0) * int64(g0)
  336. f0g1 := int64(f0) * int64(g1)
  337. f0g2 := int64(f0) * int64(g2)
  338. f0g3 := int64(f0) * int64(g3)
  339. f0g4 := int64(f0) * int64(g4)
  340. f0g5 := int64(f0) * int64(g5)
  341. f0g6 := int64(f0) * int64(g6)
  342. f0g7 := int64(f0) * int64(g7)
  343. f0g8 := int64(f0) * int64(g8)
  344. f0g9 := int64(f0) * int64(g9)
  345. f1g0 := int64(f1) * int64(g0)
  346. f1g1_2 := int64(f1_2) * int64(g1)
  347. f1g2 := int64(f1) * int64(g2)
  348. f1g3_2 := int64(f1_2) * int64(g3)
  349. f1g4 := int64(f1) * int64(g4)
  350. f1g5_2 := int64(f1_2) * int64(g5)
  351. f1g6 := int64(f1) * int64(g6)
  352. f1g7_2 := int64(f1_2) * int64(g7)
  353. f1g8 := int64(f1) * int64(g8)
  354. f1g9_38 := int64(f1_2) * int64(g9_19)
  355. f2g0 := int64(f2) * int64(g0)
  356. f2g1 := int64(f2) * int64(g1)
  357. f2g2 := int64(f2) * int64(g2)
  358. f2g3 := int64(f2) * int64(g3)
  359. f2g4 := int64(f2) * int64(g4)
  360. f2g5 := int64(f2) * int64(g5)
  361. f2g6 := int64(f2) * int64(g6)
  362. f2g7 := int64(f2) * int64(g7)
  363. f2g8_19 := int64(f2) * int64(g8_19)
  364. f2g9_19 := int64(f2) * int64(g9_19)
  365. f3g0 := int64(f3) * int64(g0)
  366. f3g1_2 := int64(f3_2) * int64(g1)
  367. f3g2 := int64(f3) * int64(g2)
  368. f3g3_2 := int64(f3_2) * int64(g3)
  369. f3g4 := int64(f3) * int64(g4)
  370. f3g5_2 := int64(f3_2) * int64(g5)
  371. f3g6 := int64(f3) * int64(g6)
  372. f3g7_38 := int64(f3_2) * int64(g7_19)
  373. f3g8_19 := int64(f3) * int64(g8_19)
  374. f3g9_38 := int64(f3_2) * int64(g9_19)
  375. f4g0 := int64(f4) * int64(g0)
  376. f4g1 := int64(f4) * int64(g1)
  377. f4g2 := int64(f4) * int64(g2)
  378. f4g3 := int64(f4) * int64(g3)
  379. f4g4 := int64(f4) * int64(g4)
  380. f4g5 := int64(f4) * int64(g5)
  381. f4g6_19 := int64(f4) * int64(g6_19)
  382. f4g7_19 := int64(f4) * int64(g7_19)
  383. f4g8_19 := int64(f4) * int64(g8_19)
  384. f4g9_19 := int64(f4) * int64(g9_19)
  385. f5g0 := int64(f5) * int64(g0)
  386. f5g1_2 := int64(f5_2) * int64(g1)
  387. f5g2 := int64(f5) * int64(g2)
  388. f5g3_2 := int64(f5_2) * int64(g3)
  389. f5g4 := int64(f5) * int64(g4)
  390. f5g5_38 := int64(f5_2) * int64(g5_19)
  391. f5g6_19 := int64(f5) * int64(g6_19)
  392. f5g7_38 := int64(f5_2) * int64(g7_19)
  393. f5g8_19 := int64(f5) * int64(g8_19)
  394. f5g9_38 := int64(f5_2) * int64(g9_19)
  395. f6g0 := int64(f6) * int64(g0)
  396. f6g1 := int64(f6) * int64(g1)
  397. f6g2 := int64(f6) * int64(g2)
  398. f6g3 := int64(f6) * int64(g3)
  399. f6g4_19 := int64(f6) * int64(g4_19)
  400. f6g5_19 := int64(f6) * int64(g5_19)
  401. f6g6_19 := int64(f6) * int64(g6_19)
  402. f6g7_19 := int64(f6) * int64(g7_19)
  403. f6g8_19 := int64(f6) * int64(g8_19)
  404. f6g9_19 := int64(f6) * int64(g9_19)
  405. f7g0 := int64(f7) * int64(g0)
  406. f7g1_2 := int64(f7_2) * int64(g1)
  407. f7g2 := int64(f7) * int64(g2)
  408. f7g3_38 := int64(f7_2) * int64(g3_19)
  409. f7g4_19 := int64(f7) * int64(g4_19)
  410. f7g5_38 := int64(f7_2) * int64(g5_19)
  411. f7g6_19 := int64(f7) * int64(g6_19)
  412. f7g7_38 := int64(f7_2) * int64(g7_19)
  413. f7g8_19 := int64(f7) * int64(g8_19)
  414. f7g9_38 := int64(f7_2) * int64(g9_19)
  415. f8g0 := int64(f8) * int64(g0)
  416. f8g1 := int64(f8) * int64(g1)
  417. f8g2_19 := int64(f8) * int64(g2_19)
  418. f8g3_19 := int64(f8) * int64(g3_19)
  419. f8g4_19 := int64(f8) * int64(g4_19)
  420. f8g5_19 := int64(f8) * int64(g5_19)
  421. f8g6_19 := int64(f8) * int64(g6_19)
  422. f8g7_19 := int64(f8) * int64(g7_19)
  423. f8g8_19 := int64(f8) * int64(g8_19)
  424. f8g9_19 := int64(f8) * int64(g9_19)
  425. f9g0 := int64(f9) * int64(g0)
  426. f9g1_38 := int64(f9_2) * int64(g1_19)
  427. f9g2_19 := int64(f9) * int64(g2_19)
  428. f9g3_38 := int64(f9_2) * int64(g3_19)
  429. f9g4_19 := int64(f9) * int64(g4_19)
  430. f9g5_38 := int64(f9_2) * int64(g5_19)
  431. f9g6_19 := int64(f9) * int64(g6_19)
  432. f9g7_38 := int64(f9_2) * int64(g7_19)
  433. f9g8_19 := int64(f9) * int64(g8_19)
  434. f9g9_38 := int64(f9_2) * int64(g9_19)
  435. h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
  436. h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
  437. h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
  438. h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
  439. h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
  440. h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
  441. h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
  442. h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
  443. h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
  444. h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
  445. var carry [10]int64
  446. carry[0] = (h0 + (1 << 25)) >> 26
  447. h1 += carry[0]
  448. h0 -= carry[0] << 26
  449. carry[4] = (h4 + (1 << 25)) >> 26
  450. h5 += carry[4]
  451. h4 -= carry[4] << 26
  452. carry[1] = (h1 + (1 << 24)) >> 25
  453. h2 += carry[1]
  454. h1 -= carry[1] << 25
  455. carry[5] = (h5 + (1 << 24)) >> 25
  456. h6 += carry[5]
  457. h5 -= carry[5] << 25
  458. carry[2] = (h2 + (1 << 25)) >> 26
  459. h3 += carry[2]
  460. h2 -= carry[2] << 26
  461. carry[6] = (h6 + (1 << 25)) >> 26
  462. h7 += carry[6]
  463. h6 -= carry[6] << 26
  464. carry[3] = (h3 + (1 << 24)) >> 25
  465. h4 += carry[3]
  466. h3 -= carry[3] << 25
  467. carry[7] = (h7 + (1 << 24)) >> 25
  468. h8 += carry[7]
  469. h7 -= carry[7] << 25
  470. carry[4] = (h4 + (1 << 25)) >> 26
  471. h5 += carry[4]
  472. h4 -= carry[4] << 26
  473. carry[8] = (h8 + (1 << 25)) >> 26
  474. h9 += carry[8]
  475. h8 -= carry[8] << 26
  476. carry[9] = (h9 + (1 << 24)) >> 25
  477. h0 += carry[9] * 19
  478. h9 -= carry[9] << 25
  479. carry[0] = (h0 + (1 << 25)) >> 26
  480. h1 += carry[0]
  481. h0 -= carry[0] << 26
  482. h[0] = int32(h0)
  483. h[1] = int32(h1)
  484. h[2] = int32(h2)
  485. h[3] = int32(h3)
  486. h[4] = int32(h4)
  487. h[5] = int32(h5)
  488. h[6] = int32(h6)
  489. h[7] = int32(h7)
  490. h[8] = int32(h8)
  491. h[9] = int32(h9)
  492. }
  493. // feSquare calculates h = f*f. Can overlap h with f.
  494. // Preconditions:
  495. // |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  496. // Postconditions:
  497. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  498. func feSquare(
  499. h,
  500. f *fieldElement,
  501. ) {
  502. f0 := f[0]
  503. f1 := f[1]
  504. f2 := f[2]
  505. f3 := f[3]
  506. f4 := f[4]
  507. f5 := f[5]
  508. f6 := f[6]
  509. f7 := f[7]
  510. f8 := f[8]
  511. f9 := f[9]
  512. f0_2 := 2 * f0
  513. f1_2 := 2 * f1
  514. f2_2 := 2 * f2
  515. f3_2 := 2 * f3
  516. f4_2 := 2 * f4
  517. f5_2 := 2 * f5
  518. f6_2 := 2 * f6
  519. f7_2 := 2 * f7
  520. f5_38 := 38 * f5 // 1.31*2^30
  521. f6_19 := 19 * f6 // 1.31*2^30
  522. f7_38 := 38 * f7 // 1.31*2^30
  523. f8_19 := 19 * f8 // 1.31*2^30
  524. f9_38 := 38 * f9 // 1.31*2^30
  525. f0f0 := int64(f0) * int64(f0)
  526. f0f1_2 := int64(f0_2) * int64(f1)
  527. f0f2_2 := int64(f0_2) * int64(f2)
  528. f0f3_2 := int64(f0_2) * int64(f3)
  529. f0f4_2 := int64(f0_2) * int64(f4)
  530. f0f5_2 := int64(f0_2) * int64(f5)
  531. f0f6_2 := int64(f0_2) * int64(f6)
  532. f0f7_2 := int64(f0_2) * int64(f7)
  533. f0f8_2 := int64(f0_2) * int64(f8)
  534. f0f9_2 := int64(f0_2) * int64(f9)
  535. f1f1_2 := int64(f1_2) * int64(f1)
  536. f1f2_2 := int64(f1_2) * int64(f2)
  537. f1f3_4 := int64(f1_2) * int64(f3_2)
  538. f1f4_2 := int64(f1_2) * int64(f4)
  539. f1f5_4 := int64(f1_2) * int64(f5_2)
  540. f1f6_2 := int64(f1_2) * int64(f6)
  541. f1f7_4 := int64(f1_2) * int64(f7_2)
  542. f1f8_2 := int64(f1_2) * int64(f8)
  543. f1f9_76 := int64(f1_2) * int64(f9_38)
  544. f2f2 := int64(f2) * int64(f2)
  545. f2f3_2 := int64(f2_2) * int64(f3)
  546. f2f4_2 := int64(f2_2) * int64(f4)
  547. f2f5_2 := int64(f2_2) * int64(f5)
  548. f2f6_2 := int64(f2_2) * int64(f6)
  549. f2f7_2 := int64(f2_2) * int64(f7)
  550. f2f8_38 := int64(f2_2) * int64(f8_19)
  551. f2f9_38 := int64(f2) * int64(f9_38)
  552. f3f3_2 := int64(f3_2) * int64(f3)
  553. f3f4_2 := int64(f3_2) * int64(f4)
  554. f3f5_4 := int64(f3_2) * int64(f5_2)
  555. f3f6_2 := int64(f3_2) * int64(f6)
  556. f3f7_76 := int64(f3_2) * int64(f7_38)
  557. f3f8_38 := int64(f3_2) * int64(f8_19)
  558. f3f9_76 := int64(f3_2) * int64(f9_38)
  559. f4f4 := int64(f4) * int64(f4)
  560. f4f5_2 := int64(f4_2) * int64(f5)
  561. f4f6_38 := int64(f4_2) * int64(f6_19)
  562. f4f7_38 := int64(f4) * int64(f7_38)
  563. f4f8_38 := int64(f4_2) * int64(f8_19)
  564. f4f9_38 := int64(f4) * int64(f9_38)
  565. f5f5_38 := int64(f5) * int64(f5_38)
  566. f5f6_38 := int64(f5_2) * int64(f6_19)
  567. f5f7_76 := int64(f5_2) * int64(f7_38)
  568. f5f8_38 := int64(f5_2) * int64(f8_19)
  569. f5f9_76 := int64(f5_2) * int64(f9_38)
  570. f6f6_19 := int64(f6) * int64(f6_19)
  571. f6f7_38 := int64(f6) * int64(f7_38)
  572. f6f8_38 := int64(f6_2) * int64(f8_19)
  573. f6f9_38 := int64(f6) * int64(f9_38)
  574. f7f7_38 := int64(f7) * int64(f7_38)
  575. f7f8_38 := int64(f7_2) * int64(f8_19)
  576. f7f9_76 := int64(f7_2) * int64(f9_38)
  577. f8f8_19 := int64(f8) * int64(f8_19)
  578. f8f9_38 := int64(f8) * int64(f9_38)
  579. f9f9_38 := int64(f9) * int64(f9_38)
  580. h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
  581. h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
  582. h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
  583. h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
  584. h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
  585. h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
  586. h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
  587. h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
  588. h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
  589. h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
  590. var carry [10]int64
  591. carry[0] = (h0 + (1 << 25)) >> 26
  592. h1 += carry[0]
  593. h0 -= carry[0] << 26
  594. carry[4] = (h4 + (1 << 25)) >> 26
  595. h5 += carry[4]
  596. h4 -= carry[4] << 26
  597. carry[1] = (h1 + (1 << 24)) >> 25
  598. h2 += carry[1]
  599. h1 -= carry[1] << 25
  600. carry[5] = (h5 + (1 << 24)) >> 25
  601. h6 += carry[5]
  602. h5 -= carry[5] << 25
  603. carry[2] = (h2 + (1 << 25)) >> 26
  604. h3 += carry[2]
  605. h2 -= carry[2] << 26
  606. carry[6] = (h6 + (1 << 25)) >> 26
  607. h7 += carry[6]
  608. h6 -= carry[6] << 26
  609. carry[3] = (h3 + (1 << 24)) >> 25
  610. h4 += carry[3]
  611. h3 -= carry[3] << 25
  612. carry[7] = (h7 + (1 << 24)) >> 25
  613. h8 += carry[7]
  614. h7 -= carry[7] << 25
  615. carry[4] = (h4 + (1 << 25)) >> 26
  616. h5 += carry[4]
  617. h4 -= carry[4] << 26
  618. carry[8] = (h8 + (1 << 25)) >> 26
  619. h9 += carry[8]
  620. h8 -= carry[8] << 26
  621. carry[9] = (h9 + (1 << 24)) >> 25
  622. h0 += carry[9] * 19
  623. h9 -= carry[9] << 25
  624. carry[0] = (h0 + (1 << 25)) >> 26
  625. h1 += carry[0]
  626. h0 -= carry[0] << 26
  627. h[0] = int32(h0)
  628. h[1] = int32(h1)
  629. h[2] = int32(h2)
  630. h[3] = int32(h3)
  631. h[4] = int32(h4)
  632. h[5] = int32(h5)
  633. h[6] = int32(h6)
  634. h[7] = int32(h7)
  635. h[8] = int32(h8)
  636. h[9] = int32(h9)
  637. }
  638. // feMul121666 calculates h = f * 121666. Can overlap h with f.
  639. // Preconditions:
  640. // |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  641. // Postconditions:
  642. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  643. func feMul121666(
  644. h,
  645. f *fieldElement,
  646. ) {
  647. h0 := int64(f[0]) * 121666
  648. h1 := int64(f[1]) * 121666
  649. h2 := int64(f[2]) * 121666
  650. h3 := int64(f[3]) * 121666
  651. h4 := int64(f[4]) * 121666
  652. h5 := int64(f[5]) * 121666
  653. h6 := int64(f[6]) * 121666
  654. h7 := int64(f[7]) * 121666
  655. h8 := int64(f[8]) * 121666
  656. h9 := int64(f[9]) * 121666
  657. var carry [10]int64
  658. carry[9] = (h9 + (1 << 24)) >> 25
  659. h0 += carry[9] * 19
  660. h9 -= carry[9] << 25
  661. carry[1] = (h1 + (1 << 24)) >> 25
  662. h2 += carry[1]
  663. h1 -= carry[1] << 25
  664. carry[3] = (h3 + (1 << 24)) >> 25
  665. h4 += carry[3]
  666. h3 -= carry[3] << 25
  667. carry[5] = (h5 + (1 << 24)) >> 25
  668. h6 += carry[5]
  669. h5 -= carry[5] << 25
  670. carry[7] = (h7 + (1 << 24)) >> 25
  671. h8 += carry[7]
  672. h7 -= carry[7] << 25
  673. carry[0] = (h0 + (1 << 25)) >> 26
  674. h1 += carry[0]
  675. h0 -= carry[0] << 26
  676. carry[2] = (h2 + (1 << 25)) >> 26
  677. h3 += carry[2]
  678. h2 -= carry[2] << 26
  679. carry[4] = (h4 + (1 << 25)) >> 26
  680. h5 += carry[4]
  681. h4 -= carry[4] << 26
  682. carry[6] = (h6 + (1 << 25)) >> 26
  683. h7 += carry[6]
  684. h6 -= carry[6] << 26
  685. carry[8] = (h8 + (1 << 25)) >> 26
  686. h9 += carry[8]
  687. h8 -= carry[8] << 26
  688. h[0] = int32(h0)
  689. h[1] = int32(h1)
  690. h[2] = int32(h2)
  691. h[3] = int32(h3)
  692. h[4] = int32(h4)
  693. h[5] = int32(h5)
  694. h[6] = int32(h6)
  695. h[7] = int32(h7)
  696. h[8] = int32(h8)
  697. h[9] = int32(h9)
  698. }
  699. func feInvert(
  700. dst,
  701. z *fieldElement,
  702. ) {
  703. var t0, t1, t2, t3 fieldElement
  704. var i int
  705. feSquare(
  706. &t0,
  707. z,
  708. ) // 2^1
  709. feSquare(
  710. &t1,
  711. &t0,
  712. ) // 2^2
  713. for i = 1; i < 2; i++ {
  714. feSquare(
  715. &t1,
  716. &t1,
  717. )
  718. } // 2^3
  719. feMul(
  720. &t1,
  721. z,
  722. &t1,
  723. ) // 2^3 + 2^0
  724. feMul(
  725. &t0,
  726. &t0,
  727. &t1,
  728. ) // 2^3 + 2^1 + 2^0
  729. feSquare(
  730. &t2,
  731. &t0,
  732. ) // 2^4 + 2^2 + 2^1
  733. feMul(
  734. &t1,
  735. &t1,
  736. &t2,
  737. ) // 2^4 + 2^3 + 2^2 + 2^1 + 2^0
  738. feSquare(
  739. &t2,
  740. &t1,
  741. ) // 5,4,3,2,1
  742. for i = 1; i < 5; i++ {
  743. feSquare(
  744. &t2,
  745. &t2,
  746. )
  747. } // 9,8,7,6,5
  748. feMul(
  749. &t1,
  750. &t2,
  751. &t1,
  752. ) // 9,8,7,6,5,4,3,2,1,0
  753. feSquare(
  754. &t2,
  755. &t1,
  756. ) // 10..1
  757. for i = 1; i < 10; i++ {
  758. feSquare(
  759. &t2,
  760. &t2,
  761. )
  762. } // 19..10
  763. feMul(
  764. &t2,
  765. &t2,
  766. &t1,
  767. ) // 19..0
  768. feSquare(
  769. &t3,
  770. &t2,
  771. ) // 20..1
  772. for i = 1; i < 20; i++ {
  773. feSquare(
  774. &t3,
  775. &t3,
  776. )
  777. } // 39..20
  778. feMul(
  779. &t2,
  780. &t3,
  781. &t2,
  782. ) // 39..0
  783. feSquare(
  784. &t2,
  785. &t2,
  786. ) // 40..1
  787. for i = 1; i < 10; i++ {
  788. feSquare(
  789. &t2,
  790. &t2,
  791. )
  792. } // 49..10
  793. feMul(
  794. &t1,
  795. &t2,
  796. &t1,
  797. ) // 49..0
  798. feSquare(
  799. &t2,
  800. &t1,
  801. ) // 50..1
  802. for i = 1; i < 50; i++ {
  803. feSquare(
  804. &t2,
  805. &t2,
  806. )
  807. } // 99..50
  808. feMul(
  809. &t2,
  810. &t2,
  811. &t1,
  812. ) // 99..0
  813. feSquare(
  814. &t3,
  815. &t2,
  816. ) // 100..1
  817. for i = 1; i < 100; i++ {
  818. feSquare(
  819. &t3,
  820. &t3,
  821. )
  822. } // 199..100
  823. feMul(
  824. &t2,
  825. &t3,
  826. &t2,
  827. ) // 199..0
  828. feSquare(
  829. &t2,
  830. &t2,
  831. ) // 200..1
  832. for i = 1; i < 50; i++ {
  833. feSquare(
  834. &t2,
  835. &t2,
  836. )
  837. } // 249..50
  838. feMul(
  839. &t1,
  840. &t2,
  841. &t1,
  842. ) // 249..0
  843. feSquare(
  844. &t1,
  845. &t1,
  846. ) // 250..1
  847. for i = 1; i < 5; i++ {
  848. feSquare(
  849. &t1,
  850. &t1,
  851. )
  852. } // 254..5
  853. feMul(
  854. dst,
  855. &t1,
  856. &t0,
  857. ) // 254..5,3,1,0
  858. }
  859. // OldScalarMultGeneric provides a platform-independent pure Go implementation
  860. // of OldScalarMult. It is used by default when a platform-specific optimized
  861. // version of OldScalarMult is not available. It is exported to provide
  862. // implementators an alternative if they encounter any trouble with an
  863. // optimized version and wish to call the pure Go implementation explicitly,
  864. // but should not be needed for normal use.
  865. func OldScalarMultGeneric(
  866. dst,
  867. scalar,
  868. point *[X25519Size]byte,
  869. ) error {
  870. var e [X25519Size]byte
  871. // Dubious to perform clamping at this stage,
  872. // *but*, behavior matches that of libsodium
  873. copy(
  874. e[:],
  875. scalar[:],
  876. )
  877. e[0] &= 248
  878. e[31] &= 127
  879. e[31] |= 64
  880. var x1, x2, z2, x3, z3, tmp0, tmp1 fieldElement
  881. feFromBytes(
  882. &x1,
  883. point,
  884. )
  885. feOne(
  886. &x2,
  887. )
  888. feCopy(
  889. &x3,
  890. &x1,
  891. )
  892. feOne(
  893. &z3,
  894. )
  895. swap := int32(0)
  896. for pos := 254; pos >= 0; pos-- {
  897. b := e[pos/8] >> uint(pos&7)
  898. b &= 1
  899. swap ^= int32(b)
  900. feCSwap(
  901. &x2,
  902. &x3,
  903. swap,
  904. )
  905. feCSwap(
  906. &z2,
  907. &z3,
  908. swap,
  909. )
  910. swap = int32(b)
  911. feSub(
  912. &tmp0,
  913. &x3,
  914. &z3,
  915. )
  916. feSub(
  917. &tmp1,
  918. &x2,
  919. &z2,
  920. )
  921. feAdd(
  922. &x2,
  923. &x2,
  924. &z2,
  925. )
  926. feAdd(
  927. &z2,
  928. &x3,
  929. &z3,
  930. )
  931. feMul(
  932. &z3,
  933. &tmp0,
  934. &x2,
  935. )
  936. feMul(
  937. &z2,
  938. &z2,
  939. &tmp1,
  940. )
  941. feSquare(
  942. &tmp0,
  943. &tmp1,
  944. )
  945. feSquare(
  946. &tmp1,
  947. &x2,
  948. )
  949. feAdd(
  950. &x3,
  951. &z3,
  952. &z2,
  953. )
  954. feSub(
  955. &z2,
  956. &z3,
  957. &z2,
  958. )
  959. feMul(
  960. &x2,
  961. &tmp1,
  962. &tmp0,
  963. )
  964. feSub(
  965. &tmp1,
  966. &tmp1,
  967. &tmp0,
  968. )
  969. feSquare(
  970. &z2,
  971. &z2,
  972. )
  973. feMul121666(
  974. &z3,
  975. &tmp1,
  976. )
  977. feSquare(
  978. &x3,
  979. &x3,
  980. )
  981. feAdd(
  982. &tmp0,
  983. &tmp0,
  984. &z3,
  985. )
  986. feMul(
  987. &z3,
  988. &x1,
  989. &z2,
  990. )
  991. feMul(
  992. &z2,
  993. &tmp1,
  994. &tmp0,
  995. )
  996. }
  997. feCSwap(
  998. &x2,
  999. &x3,
  1000. swap,
  1001. )
  1002. feCSwap(
  1003. &z2,
  1004. &z3,
  1005. swap,
  1006. )
  1007. feInvert(
  1008. &z2,
  1009. &z2,
  1010. )
  1011. feMul(
  1012. &x2,
  1013. &x2,
  1014. &z2,
  1015. )
  1016. feToBytes(
  1017. dst,
  1018. &x2,
  1019. )
  1020. return nil
  1021. }