draft-terriberry-netvc-codingtools.xml
  1. <?xml version="1.0" encoding="utf-8"?>
  2. <!DOCTYPE rfc SYSTEM 'rfc2629.dtd'>
  3. <?rfc toc="yes" symrefs="yes" ?>
  4. <rfc ipr="trust200902" category="info" docName="draft-terriberry-netvc-codingtools-02">
  5. <front>
  6. <title abbrev="Coding Tools">Coding Tools for a Next Generation Video Codec</title>
  7. <author initials="T.B." surname="Terriberry" fullname="Timothy B. Terriberry">
  8. <organization>Mozilla Corporation</organization>
  9. <address>
  10. <postal>
  11. <street>331 E. Evelyn Avenue</street>
  12. <city>Mountain View</city>
  13. <region>CA</region>
  14. <code>94041</code>
  15. <country>USA</country>
  16. </postal>
  17. <phone>+1 650 903-0800</phone>
  18. <email>tterribe@xiph.org</email>
  19. </address>
  20. </author>
  21. <author initials="N.E." surname="Egge" fullname="Nathan E. Egge">
  22. <organization>Mozilla Corporation</organization>
  23. <address>
  24. <postal>
  25. <street>331 E. Evelyn Avenue</street>
  26. <city>Mountain View</city>
  27. <region>CA</region>
  28. <code>94041</code>
  29. <country>USA</country>
  30. </postal>
  31. <phone>+1 650 903-0800</phone>
  32. <email>negge@xiph.org</email>
  33. </address>
  34. </author>
  35. <date day="24" month="April" year="2017"/>
  36. <area>ART</area>
  37. <workgroup>netvc</workgroup>
  38. <abstract>
  39. <t>
  40. This document proposes a number of coding tools that could be incorporated into
  41. a next-generation video codec.
  42. </t>
  43. </abstract>
  44. </front>
  45. <middle>
  46. <section anchor="intro" title="Introduction">
  47. <t>
  48. One of the biggest contributing factors to the success of the Internet is that
  49. the underlying protocols are implementable on a royalty-free basis.
  50. This allows them to be implemented widely and easily distributed by application
  51. developers, service operators, and end users, without asking for permission.
  52. In order to produce a next-generation video codec that is competitive with the
  53. best patent-encumbered standards, yet avoids patents which are not available
  54. on an open-source compatible, royalty-free basis, we must use old coding tools
  55. in new ways and develop new coding tools.
  56. This draft documents some of the tools we have been working on for inclusion in
  57. such a codec.
  58. This is early work, and the performance of some of these tools (especially in
  59. relation to other approaches) is not yet fully known.
  60. Nevertheless, it still serves to outline some possibilities that NETVC could
  61. consider.
  62. </t>
  63. </section>
  64. <section anchor="entropy_coding" title="Entropy Coding">
  65. <t>
  66. The basic theory of entropy coding was well-established by the late
  67. 1970's&nbsp;<xref target="Pas76"/>.
  68. Modern video codecs have focused on Huffman codes (or "Variable-Length
  69. Codes"/VLCs) and binary arithmetic coding.
  70. Huffman codes are limited in the amount of compression they can provide and the
  71. design flexibility they allow, but as each code word consists of an integer
  72. number of bits, their implementation complexity is very low, so they were
  73. provided at least as an option in every video codec up through H.264.
  74. Arithmetic coding, on the other hand, uses code words that can take up
  75. fractional parts of a bit, and are more complex to implement.
  76. However, the prevalence of cheap, H.264 High Profile hardware, which requires
  77. support for arithmetic coding, shows that it is no longer so expensive that a
  78. fallback VLC-based approach is required.
  79. Having a single entropy-coding method simplifies both up-front design costs and
  80. interoperability.
  81. </t>
  82. <t>
  83. However, the primary limitation of arithmetic coding is that it is an
  84. inherently serial operation.
  85. A given symbol cannot be decoded until the previous symbol is decoded, because
  86. the bits (if any) that are output depend on the exact state of the decoder at
  87. the time it is decoded.
  88. This means that a hardware implementation must run at a sufficiently high clock
  89. rate to be able to decode all of the symbols in a frame.
  90. Higher clock rates lead to increased power consumption, and in some cases the
  91. entropy coding is actually becoming the limiting factor in these designs.
  92. </t>
  93. <t>
  94. As fabrication processes improve, implementers are very willing to trade
  95. increased gate count for lower clock speeds.
  96. So far, most approaches to allowing parallel entropy coding have focused on
  97. splitting the encoded symbols into multiple streams that can be decoded
  98. independently.
  99. This "independence" requirement has a non-negligible impact on compression,
  100. parallelizability, or both.
  101. For example, H.264 can split frames into "slices" which might cover only a
  102. small subset of the blocks in the frame.
  103. In order to allow decoding these slices independently, they cannot use context
  104. information from blocks in other slices (harming compression).
  105. Those contexts must adapt rapidly to account for the generally small number of
  106. symbols available for learning probabilities (also harming compression).
  107. In some cases the number of contexts must be reduced to ensure enough symbols
  108. are coded in each context to usefully learn probabilities at all (once more,
  109. harming compression).
  110. Furthermore, an encoder must specially format the stream to use multiple slices
  111. per frame to allow any parallel entropy decoding at all.
  112. Encoders rarely have enough information to evaluate this "compression
  113. efficiency" vs. "parallelizability" trade-off, since they don't generally know
  114. the limitations of the decoders for which they are encoding.
  115. That means there will be many files or streams which could have been decoded if
  116. they were encoded with different options, but which a given decoder cannot
  117. decode because of bad choices made by the encoder (at least from the
  118. perspective of that decoder).
  119. The same set of drawbacks apply to the DCT token partitions in
  120. VP8&nbsp;<xref target="RFC6386"/>.
  121. </t>
  122. <section anchor="nonbinary_coding" title="Non-binary Arithmetic Coding">
  123. <t>
  124. Instead, we propose a very different approach: use non-binary arithmetic
  125. coding.
  126. In binary arithmetic coding, each decoded symbol has one of two possible
  127. values: 0 or 1.
  128. The original arithmetic coding algorithms allow a symbol to take on any number
  129. of possible values, and allow the size of that alphabet to change with each
  130. symbol coded.
  131. Reasonable values of N (for example, N&nbsp;&lt;=&nbsp;16) offer the potential
  132. for a decent throughput increase for a reasonable increase in gate count for
  133. hardware implementations.
  134. </t>
  135. <t>
  136. Binary coding allows a number of computational simplifications.
  137. For example, for each coded symbol, the set of valid code points is partitioned
  138. in two, and the decoded value is determined by finding the partition in which
  139. the actual code point that was received lies.
  140. This can be determined by computing a single partition value (in both the
  141. encoder and decoder) and (in the decoder) doing a single comparison.
  142. A non-binary arithmetic coder partitions the set of valid code points
  143. into multiple pieces (one for each possible value of the coded symbol).
  144. This requires the encoder to compute two partition values, in general (for both
  145. the upper and lower bound of the symbol to encode).
  146. The decoder, on the other hand, must search the partitions for the one that
  147. contains the received code point.
  148. This requires computing at least O(log&nbsp;N) partition values.
  149. </t>
  150. <t>
  151. However, coding a parameter with N possible values with a binary arithmetic
  152. coder requires O(log&nbsp;N) symbols in the worst case (the only case
  153. that matters for hardware design).
  154. Hence, this does not represent any actual savings (indeed, it represents an
  155. increase in the number of partition values computed by the encoder).
  156. In addition, there are a number of overheads that are per-symbol, rather than
  157. per-value.
  158. For example, renormalization (which enlarges the set of valid code points after
  159. partitioning has reduced it too much), carry propagation (to deal with the
  160. case where the high and low ends of a partition straddle a bit boundary),
  161. etc., are all performed on a symbol-by-symbol basis.
  162. Since a non-binary arithmetic coder codes a given set of values with fewer
  163. symbols than a binary one, it incurs these per-symbol overheads less often.
  164. This suggests that a non-binary arithmetic coder can actually be more efficient
  165. than a binary one.
  166. </t>
  167. </section>
  168. <section anchor="nonbinary_modeling" title="Non-binary Context Modeling">
  169. <t>
  170. The other aspect that binary coding simplifies is probability modeling.
  171. In arithmetic coding, the size of the sets the code points are partitioned into
  172. are (roughly) proportional to the probability of each possible symbol value.
  173. Estimating these probabilities is part of the coding process, though it can be
  174. cleanly separated from the task of actually producing the coded bits.
  175. In a binary arithmetic coder, this requires estimating the probability of only
  176. one of the two possible values (since the total probability is 1.0).
  177. This is often done with a simple table lookup that maps the old probability and
  178. the most recently decoded symbol to a new probability to use for the next
  179. symbol in the current context.
  180. The trade-off, of course, is that non-binary symbols must be "binarized" into
  181. a series of bits, and a context (with an associated probability) chosen for
  182. each one.
  183. </t>
  184. <t>
  185. In a non-binary arithmetic coder, the decoder must compute at least
  186. O(log&nbsp;N) cumulative probabilities (one for each partition value it
  187. needs).
  188. Because these probabilities are usually not estimated directly in "cumulative"
  189. form, this can require computing (N&nbsp;-&nbsp;1) non-cumulative probability
  190. values.
  191. Unless N is very small, these cannot be updated with a single table lookup.
  192. The normal approach is to use "frequency counts".
  193. Define the frequency of value k to be
  194. <figure align="center">
  195. <artwork align="center"><![CDATA[
  196. f[k] = A*<the number of times k has been observed> + B
  197. ]]></artwork>
  198. </figure>
  199. where A and B are parameters (usually A=2 and B=1 for a traditional
  200. Krichevsky-Trofimov estimator).
  201. The resulting probability, p[k], is given by
  202. <figure align="center">
  203. <artwork align="center"><![CDATA[
  204. N-1
  205. __
  206. ft = \ f[k]
  207. /_
  208. k=0
  209. f[k]
  210. p[k] = ----
  211. ft
  212. ]]></artwork>
  213. </figure>
  214. When ft grows too large, the frequencies are rescaled (e.g., halved, rounding
  215. up to prevent reduction of a probability to 0).
  216. </t>
  217. <t>
  218. When ft is not a power of two, partitioning the code points requires actual
  219. divisions (see <xref target="RFC6716"/> Section&nbsp;4.1 for one detailed
  220. example of exactly how this is done).
  221. These divisions are acceptable in an audio codec like
  222. Opus&nbsp;<xref target="RFC6716"/>, which only has to code a few hundreds of
  223. these symbols per second.
  224. But video requires hundreds of thousands of symbols per second, at a minimum,
  225. and divisions are still very expensive to implement in hardware.
  226. </t>
  227. <t>
  228. There are two possible approaches to this.
  229. One is to come up with a replacement for frequency counts that produces
  230. probabilities that sum to a power of two.
  231. Some possibilities, which can be applied individually or in combination:
  232. <list style="numbers">
  233. <t>
  234. Use probabilities that are fixed for the duration of a frame.
  235. This is the approach taken by VP8, for example, even though it uses a binary
  236. arithmetic coder.
  237. In fact, it is possible to convert many of VP8's existing binary-alphabet
  238. probabilities into probabilities for non-binary alphabets, an approach that is
  239. used in the experiment presented at the end of this section.
  240. </t>
  241. <t>
  242. Use parametric distributions.
  243. For example, DCT coefficient magnitudes usually have an approximately
  244. exponential distribution.
  245. This distribution can be characterized by a single parameter, e.g., the
  246. expected value.
  247. The expected value is trivial to update after decoding a coefficient.
  248. For example
  249. <figure align="center">
  250. <artwork align="center"><![CDATA[
  251. E[x[n+1]] = E[x[n]] + floor(C*(x[n] - E[x[n]]))
  252. ]]></artwork>
  253. </figure>
  254. produces an exponential moving average with a decay factor of
  255. (1&nbsp;-&nbsp;C).
  256. For a choice of C that is a negative power of two (e.g., 1/16 or 1/32 or
  257. similar), this can be implemented with two adds and a shift.
  258. Given this expected value, the actual distribution to use can be obtained from
  259. a small set of pre-computed distributions via a lookup table.
  260. Linear interpolation between these pre-computed values can improve accuracy, at
  261. the cost of O(N) computations, but if N is kept small this is trivially
  262. parallelizable, in SIMD or otherwise.
  263. </t>
  264. <t>
  265. Change the frequency count update mechanism so that ft is constant.
  266. This approach is described in the next section.
  267. </t>
  268. </list>
  269. </t>
  270. </section>
  271. <section anchor="dyadic_adaptation" title="Dyadic Adaptation">
  272. <t>
  273. The goal with context adaptation using dyadic probabilities is to maintain
  274. the invariant that the probabilities all sum to a power of two before and
  275. after adaptation.
  276. This can be achieved with a special update function that blends the cumulative
  277. probabilities of the current context with a cumulative distribution function
  278. where the coded symbol has probability 1.
  279. </t>
  280. <t>
  281. Suppose we have a model for a given context that codes 8 symbols with the
  282. following probabilities:
  283. <figure align="center">
  284. <artwork align="center"><![CDATA[
  285. +------+------+------+------+------+------+------+------+
  286. | p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7] |
  287. +------+------+------+------+------+------+------+------+
  288. | 1/8 | 1/8 | 3/16 | 1/16 | 1/16 | 3/16 | 1/8 | 1/8 |
  289. +------+------+------+------+------+------+------+------+
  290. ]]></artwork>
  291. </figure>
  292. Then the cumulative distribution function is:
  293. <figure align="center">
  294. <artwork align="left"><![CDATA[
  295. CDF
  296. 1 + +------+
  297. | |
  298. | +------+
  299. | |
  300. 3/4 + +------+
  301. | |
  302. | |
  303. | +------+
  304. 1/2 + +------+
  305. | +------+
  306. | |
  307. | |
  308. 1/4 + +------+
  309. | |
  310. +------+
  311. |
  312. 0 +------+------+------+------+------+------+------+------+ Bin
  313. fl[1] fl[2] fl[3] fl[4] fl[5] fl[6] fl[7] fl[8]
  314. ]]></artwork>
  315. </figure>
  316. Suppose we code symbol 3 and wish to update the context model so that this
  317. symbol is now more likely.
  318. This can be done by blending the CDF for the current context with a CDF
  319. that has symbol 3 with likelihood 1.
  320. <figure align="center">
  321. <artwork align="left"><![CDATA[
  322. CDF
  323. 1 + +----------------------------------+
  324. | |
  325. | |
  326. | |
  327. 0 +------+------+------+------+------+------+------+------+ Bin
  328. fl[1] fl[2] fl[3] fl[4] fl[5] fl[6] fl[7] fl[8]
  329. ]]></artwork>
  330. </figure>
  331. Given an adaptation rate g between 0 and 1, and assuming ft = 2^4 = 16, what
  332. we are computing is:
  333. <figure align="center">
  334. <artwork align="center"><![CDATA[
  335. +------+------+------+------+------+------+------+------+
  336. | 2 | 4 | 7 | 8 | 9 | 12 | 14 | 16 | * (1 - g)
  337. +------+------+------+------+------+------+------+------+
  338. +
  339. +------+------+------+------+------+------+------+------+
  340. | 0 | 0 | 0 | 16 | 16 | 16 | 16 | 16 | * g
  341. +------+------+------+------+------+------+------+------+
  342. ]]></artwork>
  343. </figure>
  344. In order to prevent the probability of any one symbol from going to zero, the
  345. blending functions above and below the coded symbol are adjusted so that no
  346. adjacent cumulative probabilities are the same.
  347. </t>
  348. <t>
  349. Let M be the alphabet size and 1/2^r be the adaptation rate:
  350. </t>
  351. <t>
  352. <figure align="center">
  353. <artwork align="center"><![CDATA[
  354. ( fl[i] - floor((fl[i] + 2^r - i - 1)/2^r), i <= coded symbol
  355. fl[i] = <
  356. ( fl[i] - floor((fl[i] + M - i - ft)/2^r), i > coded symbol
  357. ]]></artwork>
  358. </figure>
  359. Applying these formulas to the example CDF where M = 8 with adaptation rate
  360. 1/2^16 gives the updated CDF:
  361. <figure align="center">
  362. <artwork align="center"><![CDATA[
  363. +------+------+------+------+------+------+------+------+
  364. | 1 | 3 | 6 | 9 | 10 | 13 | 15 | 16 |
  365. +------+------+------+------+------+------+------+------+
  366. ]]></artwork>
  367. </figure>
  368. Looking at the graph of the CDF we see that the likelihood for symbol 3
  369. has gone up from 1/16 to 3/16, dropping the likelihood of all other symbols
  370. to make room.
  371. <figure align="center">
  372. <artwork align="left"><![CDATA[
  373. CDF
  374. 1 + +------+
  375. | +------+
  376. | |
  377. | +------+
  378. 3/4 + |
  379. | |
  380. | +------+
  381. | +------+
  382. 1/2 + |
  383. | |
  384. | +------+
  385. | |
  386. 1/4 + |
  387. | +------+
  388. | |
  389. +------+
  390. 0 +------+------+------+------+------+------+------+------+ Bin
  391. fl[1] fl[2] fl[3] fl[4] fl[5] fl[6] fl[7] fl[8]
  392. ]]></artwork>
  393. </figure>
  394. </t>
  395. </section>
  396. <section title="Simplified Partition Function">
  397. <t>
  398. Let the range of valid code points in the current arithmetic coder state be
  399. [L,&nbsp;L&nbsp;+&nbsp;R), where L is the lower bound of the range and R is
  400. the number of valid code points.
  401. The goal of the arithmetic coder is to partition this interval proportional to
  402. the probability of each symbol.
  403. When using dyadic probabilities, the partition point in the range corresponding
  404. to a given CDF value can be determined via
  405. </t>
  406. <figure align="center">
  407. <artwork align="center"><![CDATA[
  408. fl[k]*R
  409. u[k] = floor ( ------- )
  410. ft
  411. ]]></artwork>
  412. </figure>
  413. <t>
  414. Since ft is a power of two, this may be implemented using a right shift by T
  415. bits in place of the division:
  416. </t>
  417. <figure align="center">
  418. <artwork align="center"><![CDATA[
  419. u[k] = (fl[k]*R) >> T
  420. ]]></artwork>
  421. </figure>
  422. <t>
  423. The latency of the multiply still dominates the hardware timing.
  424. However, we can reduce this latency by using a smaller multiply, at the cost of
  425. some accuracy in the partition.
  426. We cannot, in general, reduce the size of fl[k], since this might send a
  427. probability to zero (i.e., cause u[k] to have the same value as u[k+1]).
  428. On the other hand, we know that the top bit of R is always 1, since it gets
  429. renormalized with every symbol that is encoded.
  430. Suppose R contains 16 bits and that T is at least 8.
  431. Then we can greatly reduce the size of the multiply by using the formula
  432. </t>
  433. <figure align="center">
  434. <artwork align="center"><![CDATA[
  435. ( (fl[k]*(R >> 8)) >> (T - 8), 0 <= k < M
  436. u[k] = <
  437. ( R, k == M
  438. ]]></artwork>
  439. </figure>
  440. <t>
  441. The special case for k&nbsp;==&nbsp;M is required because, with the general
  442. formula, u[M] no longer exactly equals R.
  443. Without the special case we would waste some amount of code space and require
  444. the decoder to check for invalid streams.
  445. This special case slightly inflates the probability of the last symbol.
  446. Unfortunately, in codecs the usual convention is that the last symbol is the
  447. least probable, while the first symbol (e.g., 0) is the most probable.
  448. That maximizes the coding overhead introduced by this approximation error.
  449. To minimize it, we instead add all of the accumulated error to the first symbol
  450. by using a variation of the above update formula:
  451. </t>
  452. <figure align="center">
  453. <artwork align="center"><![CDATA[
  454. ( 0, k == 0
  455. u[k] = <
  456. ( R - (((ft - fl[k])*(R >> 8)) >> (T - 8)), 0 < k <= M
  457. ]]></artwork>
  458. </figure>
  459. <t>
  460. This also aids the software decoder search, since it can prime the search loop
  461. with the special case, instead of needing to check for it on every iteration
  462. of the loop.
  463. It is easier to incorporate into a SIMD search as well.
  464. It does, however, add two subtractions.
  465. Since the encoder always operates on the difference between two partition
  466. points, the first subtraction (involving R) can be eliminated.
  467. Similar optimizations can eliminate this subtraction in the decoder by flipping
  468. its internal state (measuring the distance of the encoder output from the top
  469. of the range instead of the bottom).
  470. To avoid the other subtraction, we can simply use "inverse CDFs" that natively
  471. store ifl[k]&nbsp;=&nbsp;(ft&nbsp;-&nbsp;fl[k]) instead of fl[k].
  472. This produces the following partition function:
  473. </t>
  474. <figure align="center">
  475. <artwork align="center"><![CDATA[
  476. ( R, k == 0
  477. R - u[k] = <
  478. ( (ifl[k]*(R >> 8)) >> (T - 8), 0 < k <= M
  479. ]]></artwork>
  480. </figure>
  481. <t>
  482. The reduction in hardware latency can be as much as 20%, and the impact on area
  483. is even larger.
  484. The overall software complexity overhead is minimal, and the coding efficiency
  485. overhead due to the approximation is about 0.02%.
  486. We could have achieved the same efficiency by leaving the special case on the
  487. last symbol and reversing the alphabet instead of inverting the probabilities.
  488. However, reversing the alphabet at runtime would have required an extra
  489. subtraction (or more general re-ordering requires a table lookup).
  490. That may be avoidable in some cases, but only by propagating the reordering
  491. alphabet outside of the entropy coding machinery, requiring changes to every
  492. coding tool and potentially leading to confusion.
  493. CDFs, on the other hand, are already a somewhat abstract representation of the
  494. underlying probabilities used for computational efficiency reasons.
  495. Generalizing these to "inverse CDFs" is a straightforward change that only
  496. affects probability initialization and adaptation, without impacting the
  497. design of other coding tools.
  498. </t>
  499. </section>
  500. <section title="Context Adaptation">
  501. <t>
  502. The dyadic adaptation scheme described in&nbsp;<xref target="dyadic_adaptation"/>
  503. implements a low-complexity IIR filter for the steady-state case where we only
  504. want to adapt the context CDF as fast as the 1/2^r adaptation rate.
  505. In many cases, for example when coding symbols at the start of a video frame, only
  506. a limited number of symbols have been seen per context.
  507. Using this steady-state adaptation scheme risks adapting too slowly and spending
  508. too many bits to code symbols with incorrect probability estimates.
  509. In other video codecs, this problem is reduced by either implicitly or explicitly
  510. allowing for mechanisms to set the initial probability models for a given
  511. context.
  512. </t>
  513. <section title="Implicit Adaptation">
  514. <t>
  515. One implicit way to use default probabilities is to simply require as a
  516. normative part of the decoder that some specific CDFs are used to initialize
  517. each context.
  518. A representative set of inputs is run through the encoder and a frequency based
  519. probability model is computed and reloaded at the start of every frame.
  520. This has the advantage of having zero bitstream overhead and is optimal for
  521. certain stationary symbols.
  522. However, for other non-stationary symbols, or highly content-dependent contexts
  523. where the sample input is not representative, this can be worse than starting
  524. with a flat distribution as it now takes even longer to adapt to the
  525. steady-state.
  526. Moreover, the amount of hardware area required to store initial probability
  527. tables for each context goes up with the number of contexts in the codec.
  528. </t>
  529. <t>
  530. Another implicit way to deal with poor initial probabilities is through backward
  531. adaptation based on the probability estimates from the previous frame.
  532. After decoding a frame, the adapted CDFs for each context are simply kept as-is
  533. and not reset to their defaults.
  534. This has the advantage of having no bitstream overhead, and tracking certain
  535. content types closely, as we expect frames with similar content at similar rates
  536. to have well-correlated CDFs.
  537. However, this only works when we know there will be no bitstream errors due to
  538. the transport layer, e.g., TCP or HTTP.
  539. In low delay use cases (video on demand, live streaming, video conferencing),
  540. implicit backwards adaptation is avoided as it risks desynchronizing the
  541. entropy decoder state and permanently losing the video stream.
  542. </t>
  543. </section>
  544. <section title="Explicit Adaptation">
  545. <t>
  546. For codecs that include the ability to update the probability models in the
  547. bitstream, it is possible to explicitly signal a starting CDF.
  548. The previously described implicit backwards adaptation is now possible by
  549. simply explicitly coding a probability update for each frame.
  550. However, the cost of signaling the updated CDF must be overcome by the
  551. savings from coding with the updated CDF.
  552. Blindly updating all contexts per frame may work at high rates where the size
  553. of the CDFs is small relative to the coded symbol data.
  554. However at low rates, the benefit of using more accurate CDFs is quickly
  555. overcome by the cost of coding them, which increases with the number of
  556. contexts.
  557. </t>
  558. <t>
  559. More sophisticated encoders can compute the cost of coding a probability update
  560. for a given context, and compare it to the size reduction achieved by coding
  561. symbols with this context.
  562. Here all symbols for a given frame (or tile) are buffered and not serialized by
  563. the entropy coder until the end of the frame (or tile) is reached.
  564. Once the end of the entropy segment has been reached, the cost in bits for
  565. coding symbols with both the default probabilities and the proposed updated
  566. probabilities can be measured and compared.
  567. However, note that with the symbols already buffered, rather than consider the
  568. context probabilities from the previous frame, a simple frequency based
  569. probability model can be computed and measured.
  570. Because this probability model is computed based on the symbols we are about
  571. to code this technique is called forward adaptation.
  572. If the cost in bits to signal and code with this new probability model is less
  573. than that of using the default then it is used.
  574. This has the advantage of only ever coding a probability update if it is an
  575. improvement and producing a bitstream that is robust to errors, but
  576. requires an entire entropy segment's worth of symbols to be cached.
  577. </t>
  578. </section>
  579. <section anchor="early_adaptation" title="Early Adaptation">
  580. <t>
  581. We would like to take advantage of the low-cost multi-symbol CDF adaptation
  582. described in&nbsp;<xref target="dyadic_adaptation"/> in the broadest set
  583. of use cases.
  584. This means the initial probability adaptation scheme should support low-delay,
  585. error-resilient streams that can be efficiently implemented in both
  586. hardware and software.
  587. We propose an early adaptation scheme that supports this goal.
  588. </t>
  589. <t>
  590. At the beginning of a frame (or tile), all CDFs are initialized to a flat
  591. distribution.
  592. For a given multi-symbol context with M potential symbols, assume that the
  593. initial dyadic CDF is initialized so that each symbol has probability 1/M.
  594. For the first M coded symbols, the CDF is updated as follows:
  595. <figure align="center">
  596. <artwork align="center"><![CDATA[
  597. a[c,M] = ft/(M + c)
  598. ( fl[i] - floor((fl[i] - i)*a/ft), i <= coded symbol
  599. fl[i] = <
  600. ( fl[i] - floor((fl[i] + M - i - ft)*a/ft), i > coded symbol
  601. ]]></artwork>
  602. </figure>
  603. where c goes from 0 to M-1 and is the running count of the number of symbols
  604. coded with this CDF.
  605. Note that for a fixed CDF precision (ft is always a power of two) and a
  606. maximum number of possible symbols M, the values of a[c,M] can be stored
  607. in a M*(M+1)/2 element table, which is 136 entries when M = 16.
  608. </t>
  609. </section>
  610. </section>
  611. <section anchor="entropy_experiment" title="Simple Experiment">
  612. <t>
  613. As a simple experiment to validate the non-binary approach, we compared a
  614. non-binary arithmetic coder to the VP8 (binary) entropy coder.
  615. This was done by instrumenting vp8_treed_read() in libvpx to dump out the
  616. symbol decoded and the associated probabilities used to decode it.
  617. This data only includes macroblock mode and motion vector information, as the
  618. DCT token data is decoded with custom inline functions, and not
  619. vp8_treed_read().
  620. This data is available at
  621. <eref target="https://people.xiph.org/~tterribe/daala/ec_test0/ec_tokens.txt"/>.
  622. It includes 1,019,670&nbsp;values encoded using 2,125,995&nbsp;binary symbols
  623. (or 2.08&nbsp;symbols per value).
  624. We expect that with a conscious effort to group symbols during the codec
  625. design, this average could easily be increased.
  626. </t>
  627. <t>
  628. We then implemented both the regular VP8 entropy decoder (in plain C, using all
  629. of the optimizations available in libvpx at the time) and a multisymbol
  630. entropy decoder (also in plain C, using similar optimizations), which encodes
  631. each value with a single symbol.
  632. For the decoder partition search in the non-binary decoder, we used a simple
  633. for loop (O(N) worst-case), even though this could be made constant-time and
  634. branchless with a few SIMD instructions such as (on x86) PCMPGTW, PACKUSWB,
  635. and PMOVMASKB followed by BSR.
  636. The source code for both implementations is available at
  637. <eref target="https://people.xiph.org/~tterribe/daala/ec_test0/ec_test.c"/>
  638. (compile with -DEC_BINARY for the binary version and -DEC_MULTISYM for the
  639. non-binary version).
  640. </t>
  641. <t>
  642. The test simply loads the tokens, and then loops 1024 times encoding them using
  643. the probabilities provided, and then decoding them.
  644. The loop was added to reduce the impact of the overhead of loading the data,
  645. which is implemented very inefficiently.
  646. The total runtime on a Core i7 from 2010 is 53.735&nbsp;seconds for the binary
  647. version, and 27.937&nbsp;seconds for the non-binary version, or a 1.92x
  648. improvement.
  649. This is very nearly equal to the number of symbols per value in the binary
  650. coder, suggesting that the per-symbol overheads account for the vast majority
  651. of the computation time in this implementation.
  652. </t>
  653. </section>
  654. </section>
  655. <section anchor="reversible_integer_transforms"
  656. title="Reversible Integer Transforms">
  657. <t>
  658. Integer transforms in image and video coding date back to at least
  659. 1969&nbsp;<xref target="PKA69"/>.
  660. Although standards such as MPEG2 and MPEG4 Part&nbsp;2 allow some flexibility
  661. in the transform implementation, implementations were subject to drift and
  662. error accumulation, and encoders had to impose special macroblock refresh
  663. requirements to avoid these problems, not always successfully.
  664. As transforms in modern codecs only account for on the order of 10% of the
  665. total decoder complexity, and, with the use of weighted prediction with gains
  666. greater than unity and intra prediction, are far more susceptible to drift and
  667. error accumulation, it no longer makes sense to allow a non-exact transform
  668. specification.
  669. </t>
  670. <t>
  671. However, it is also possible to make such transforms "reversible", in the sense
  672. that applying the inverse transform to the result of the forward transform
  673. gives back the original input values, exactly.
  674. This gives a lossy codec, which normally quantizes the coefficients before
  675. feeding them into the inverse transform, the ability to scale all the way to
  676. lossless compression without requiring any new coding tools.
  677. This approach has been used successfully by JPEG XR, for
  678. example&nbsp;<xref target="TSSRM08"/>.
  679. </t>
  680. <t>
  681. Such reversible transforms can be constructed using "lifting steps", a series
  682. of shear operations that can represent any set of plane rotations, and thus
  683. any orthogonal transform.
  684. This approach dates back to at least 1992&nbsp;<xref target="BE92"/>, which
  685. used it to implement a four-point 1-D Discrete Cosine Transform (DCT).
  686. Their implementation requires 6&nbsp;multiplications, 10&nbsp;additions,
  687. 2&nbsp;shifts, and 2&nbsp;negations, and produces output that is a factor of
  688. sqrt(2) larger than the orthonormal version of the transform.
  689. The expansion of the dynamic range directly translates into more bits to code
  690. for lossless compression.
  691. Because the least significant bits are usually very nearly random noise, this
  692. scaling increases the coding cost by approximately half a bit per sample.
  693. </t>
  694. <section anchor="lifting_steps" title="Lifting Steps">
  695. <t>
  696. To demonstrate the idea of lifting steps, consider the two-point transform
  697. <figure align="center">
  698. <artwork align="center"><![CDATA[
  699. ___
  700. [ y0 ] / 1 [ 1 1 ] [ x0 ]
  701. [ ] = / --- [ ] [ ]
  702. [ y1 ] v 2 [ -1 1 ] [ x1 ]
  703. ]]></artwork>
  704. </figure>
  705. This can be implemented up to scale via
  706. <figure align="center">
  707. <artwork align="center"><![CDATA[
  708. y0 = x0 + x1
  709. y1 = 2*x1 - y0
  710. ]]></artwork>
  711. </figure>
  712. and reversed via
  713. <figure align="center">
  714. <artwork align="center"><![CDATA[
  715. x1 = (y0 + y1) >> 1
  716. x0 = y0 - x1
  717. ]]></artwork>
  718. </figure>
  719. </t>
  720. <t>
  721. Both y0 and y1 are too large by a factor of sqrt(2), however.
  722. </t>
  723. <t>
  724. It is also possible to implement any rotation by an angle t, including the
  725. orthonormal scale factor, by decomposing it into three steps:
  726. <figure align="center">
  727. <artwork align="center"><![CDATA[
  728. cos(t) - 1
  729. u0 = x0 + ---------- * x1
  730. sin(t)
  731. y1 = x1 + sin(t)*u0
  732. cos(t) - 1
  733. y0 = u0 + ---------- * y1
  734. sin(t)
  735. ]]></artwork>
  736. </figure>
  737. By letting t=-pi/4, we get an implementation of the first transform that
  738. includes the scaling factor.
  739. To get an integer approximation of this transform, we need only replace the
  740. transcendental constants by fixed-point approximations:
  741. <figure align="center">
  742. <artwork align="center"><![CDATA[
  743. u0 = x0 + ((27*x1 + 32) >> 6)
  744. y1 = x1 - ((45*u0 + 32) >> 6)
  745. y0 = u0 + ((27*y1 + 32) >> 6)
  746. ]]></artwork>
  747. </figure>
  748. This approximation is still perfectly reversible:
  749. <figure align="center">
  750. <artwork align="center"><![CDATA[
  751. u0 = y0 - ((27*y1 + 32) >> 6)
  752. x1 = y1 + ((45*u0 + 32) >> 6)
  753. x0 = u0 - ((27*x1 + 32) >> 6)
  754. ]]></artwork>
  755. </figure>
  756. Each of the three steps can be implemented using just two ARM instructions,
  757. with constants that have up to 14&nbsp;bits of precision (though using fewer
  758. bits allows more efficient hardware implementations, at a small cost in coding
  759. gain).
  760. However, it is still much more complex than the first approach.
  761. </t>
  762. <t>
  763. We can get a compromise with a slight modification:
  764. <figure align="center">
  765. <artwork align="center"><![CDATA[
  766. y0 = x0 + x1
  767. y1 = x1 - (y0 >> 1)
  768. ]]></artwork>
  769. </figure>
  770. This still only implements the original orthonormal transform up to scale.
  771. The y0 coefficient is too large by a factor of sqrt(2) as before, but y1 is now
  772. too small by a factor of sqrt(2).
  773. If our goal is simply to (optionally quantize) and code the result, this is
  774. good enough.
  775. The different scale factors can be incorporated into the quantization matrix in
  776. the lossy case, and the total expansion is roughly equivalent to that of the
  777. orthonormal transform in the lossless case.
  778. Plus, we can perform each step with just one ARM instruction.
  779. </t>
  780. <t>
  781. However, if instead we want to apply additional transformations to the data, or
  782. use the result to predict other data, it becomes much more convenient to have
  783. uniformly scaled outputs.
  784. For a two-point transform, there is little we can do to improve on the
  785. three-multiplications approach above.
  786. However, for a four-point transform, we can use the last approach and arrange
  787. multiple transform stages such that the "too large" and "too small" scaling
  788. factors cancel out, producing a result that has the true, uniform, orthonormal
  789. scaling.
  790. To do this, we need one more tool, which implements the following transform:
  791. <figure align="center">
  792. <artwork align="center"><![CDATA[
  793. ___
  794. [ y0 ] / 1 [ cos(t) -sin(t) ] [ 1 0 ] [ x0 ]
  795. [ ] = / --- [ ] [ ] [ ]
  796. [ y1 ] v 2 [ sin(t) cos(t) ] [ 0 2 ] [ x1 ]
  797. ]]></artwork>
  798. </figure>
  799. This takes unevenly scaled inputs, rescales them, and then rotates them.
  800. Like an ordinary rotation, it can be reduced to three lifting steps:
  801. <figure align="center">
  802. <artwork align="center"><![CDATA[
  803. _
  804. 2*cos(t) - v2
  805. u0 = x0 + ------------- * x1
  806. sin(t)
  807. ___
  808. / 1
  809. y1 = x1 + / --- * sin(t)*u0
  810. v 2
  811. _
  812. cos(t) - v2
  813. y0 = u0 + ----------- * y1
  814. sin(t)
  815. ]]></artwork>
  816. </figure>
  817. As before, the transcendental constants may be replaced by fixed-point
  818. approximations without harming the reversibility property.
  819. </t>
  820. </section>
  821. <section anchor="four_point_transform" title="4-Point Transform">
  822. <t>
  823. Using the tools from the previous section, we can design a reversible integer
  824. four-point DCT approximation with uniform, orthonormal scaling.
  825. This requires 3&nbsp;multiplies, 9&nbsp;additions, and 2&nbsp;shifts (not
  826. counting the shift and rounding offset used in the fixed-point multiplies, as
  827. these are built into the multiplier).
  828. This is significantly cheaper than the&nbsp;<xref target="BE92"/> approach, and
  829. the output scaling is smaller by a factor of sqrt(2), saving half a bit per
  830. sample in the lossless case.
  831. By comparison, the four-point forward DCT approximation used in VP9, which is
  832. not reversible, uses 6&nbsp;multiplies, 6&nbsp;additions, and 2 shifts
  833. (counting shifts and rounding offsets which cannot be merged into a single
  834. multiply instruction on ARM).
  835. Four of its multipliers also require 28-bit accumulators, whereas this proposal
  836. can use much smaller multipliers without giving up the reversibility property.
  837. The total dynamic range expansion is 1&nbsp;bit: inputs in the range [-256,255)
  838. produce transformed values in the range [-512,510).
  839. This is the smallest dynamic range expansion possible for any reversible
  840. transform constructed from mostly-linear operations.
  841. It is possible to make reversible orthogonal transforms with no dynamic range
  842. expansion by using "piecewise-linear" rotations&nbsp;<xref target="SLD04"/>,
  843. but each step requires a large number of operations in a software
  844. implementation.
  845. </t>
  846. <t>
  847. Pseudo-code for the forward transform follows:
  848. <figure align="left">
  849. <artwork align="left"><![CDATA[
  850. Input: x0, x1, x2, x3
  851. Output: y0, y1, y2, y3
  852. /* Rotate (x3, x0) by -pi/4, asymmetrically scaled output. */
  853. t3 = x0 - x3
  854. t0 = x0 - (t3 >> 1)
  855. /* Rotate (x1, x2) by pi/4, asymmetrically scaled output. */
  856. t2 = x1 + x2
  857. t2h = t2 >> 1
  858. t1 = t2h - x2
  859. /* Rotate (t2, t0) by -pi/4, asymmetrically scaled input. */
  860. y0 = t0 + t2h
  861. y2 = y0 - t2
  862. /* Rotate (t3, t1) by 3*pi/8, asymmetrically scaled input. */
  863. t3 = t3 - (45*t1 + 32 >> 6)
  864. y1 = t1 + (21*t3 + 16 >> 5)
  865. y3 = t3 - (71*y1 + 32 >> 6)
  866. ]]></artwork>
  867. </figure>
  868. Even though there are three asymmetrically scaled rotations by pi/4, by careful
  869. arrangement we can share one of the shift operations (to help software
  870. implementations: shifts by a constant are basically free in hardware).
  871. This technique can be used to even greater effect in larger transforms.
  872. </t>
  873. <t>
  874. The inverse transform is constructed by simply undoing each step in turn:
  875. <figure align="left">
  876. <artwork align="left"><![CDATA[
  877. Input: y0, y1, y2, y3
  878. Output: x0, x1, x2, x3
  879. /* Rotate (y3, y1) by -3*pi/8, asymmetrically scaled output. */
  880. t3 = y3 + (71*y1 + 32 >> 6)
  881. t1 = y1 - (21*t3 + 16 >> 5)
  882. t3 = t3 + (45*t1 + 32 >> 6)
  883. /* Rotate (y2, y0) by pi/4, asymmetrically scaled output. */
  884. t2 = y0 - y2
  885. t2h = t2 >> 1
  886. t0 = y0 - t2h
  887. /* Rotate (t1, t2) by -pi/4, asymmetrically scaled input. */
  888. x2 = t2h - t1
  889. x1 = t2 - x2
  890. /* Rotate (x3, x0) by pi/4, asymmetrically scaled input. */
  891. x0 = t0 - (t3 >> 1)
  892. x3 = x0 - t3
  893. ]]></artwork>
  894. </figure>
  895. </t>
  896. <t>
  897. Although the right shifts make this transform non-linear, we can compute
  898. "basis functions" for it by sending a vector through it with a single value
  899. set to a large constant (256 was used here), and the rest of the values set to
  900. zero.
  901. The true basis functions for a four-point DCT (up to five digits) are
  902. <figure align="left">
  903. <artwork align="left"><![CDATA[
  904. [ y0 ] [ 0.50000 0.50000 0.50000 0.50000 ] [ x0 ]
  905. [ y1 ] = [ 0.65625 0.26953 -0.26953 -0.65625 ] [ x1 ]
  906. [ y2 ] [ 0.50000 -0.50000 -0.50000 0.50000 ] [ x2 ]
  907. [ y3 ] [ 0.27344 -0.65234 0.65234 -0.27344 ] [ x3 ]
  908. ]]></artwork>
  909. </figure>
  910. The corresponding basis functions for our reversible, integer DCT, computed
  911. using the approximation described above, are
  912. <figure align="left">
  913. <artwork align="left"><![CDATA[
  914. [ y0 ] [ 0.50000 0.50000 0.50000 0.50000 ] [ x0 ]
  915. [ y1 ] = [ 0.65328 0.27060 -0.27060 -0.65328 ] [ x1 ]
  916. [ y2 ] [ 0.50000 -0.50000 -0.50000 0.50000 ] [ x2 ]
  917. [ y3 ] [ 0.27060 -0.65328 0.65328 -0.27060 ] [ x3 ]
  918. ]]></artwork>
  919. </figure>
  920. The mean squared error (MSE) of the output, compared to a true DCT, can be
  921. computed with some assumptions about the input signal.
  922. Let G be the true DCT basis and G' be the basis for our integer approximation
  923. (computed as described above).
  924. Then the error in the transformed results is
  925. <figure align="left">
  926. <artwork align="left"><![CDATA[
  927. e = G.x - G'.x = (G - G').x = D.x
  928. ]]></artwork>
  929. </figure>
  930. where D&nbsp;=&nbsp;(G&nbsp;-&nbsp;G')&nbsp;.
  931. The MSE is then&nbsp;<xref target="Que98"/>
  932. <figure align="left">
  933. <artwork align="left"><![CDATA[
  934. 1 1
  935. - * E[e^T.e] = - * E[x^T.D^T.D.x]
  936. N N
  937. 1
  938. = - * E[tr(D.x.x^T.D^T)]
  939. N
  940. 1
  941. = - * E[tr(D.Rxx.D^T)]
  942. N
  943. ]]></artwork>
  944. </figure>
  945. where Rxx is the autocorrelation matrix of the input signal.
  946. Assuming the input is a zero-mean, first-order autoregressive (AR(1)) process
  947. gives an autocorrelation matrix of
  948. <figure align="left">
  949. <artwork align="left"><![CDATA[
  950. |i - j|
  951. Rxx[i,j] = rho
  952. ]]></artwork>
  953. </figure>
  954. for some correlation coefficient rho.
  955. A value of rho&nbsp;=&nbsp;0.95 is typical for image compression applications.
  956. Smaller values are more normal for motion-compensated frame differences, but
  957. this makes surprisingly little difference in transform design.
  958. Using the above procedure, the theoretical MSE of this approximation is
  959. 1.230E-6, which is below the level of the truncation error introduced by the
  960. right shift operations.
  961. This suggests the dynamic range of the input would have to be more than
  962. 20&nbsp;bits before it became worthwhile to increase the precision of the
  963. constants used in the multiplications to improve accuracy, though it may be
  964. worth using more precision to reduce bias.
  965. </t>
  966. </section>
  967. <section anchor="larger_transforms" title="Larger Transforms">
  968. <t>
  969. The same techniques can be applied to construct a reversible eight-point DCT
  970. approximation with uniform, orthonormal scaling using 15&nbsp;multiplies,
  971. 31&nbsp;additions, and 5&nbsp;shifts.
  972. It is possible to reduce this to 11&nbsp;multiplies and 29&nbsp;additions,
  973. which is the minimum number of multiplies possible for an eight-point DCT with
  974. uniform scaling&nbsp;<xref target="LLM89"/>, by introducing a scaling factor
  975. of sqrt(2), but this harms lossless performance.
  976. The dynamic range expansion is 1.5&nbsp;bits (again the smallest possible), and
  977. the MSE is 1.592E-06.
  978. By comparison, the eight-point transform in VP9 uses 12&nbsp;multiplications,
  979. 32&nbsp;additions, and 6 shifts.
  980. </t>
  981. <t>
  982. Similarly, we have constructed a reversible sixteen-point DCT approximation
  983. with uniform, orthonormal scaling using 33&nbsp;multiplies, 83&nbsp;additions,
  984. and 16&nbsp;shifts.
  985. This is just 2&nbsp;multiplies and 2&nbsp;additions more than the
  986. (non-reversible, non-integer, but uniformly scaled) factorization
  987. in&nbsp;<xref target="LLM89"/>.
  988. By comparison, the sixteen-point transform in VP9 uses 44&nbsp;multiplies,
  989. 88&nbsp;additions, and 18&nbsp;shifts.
  990. The dynamic range expansion is only 2&nbsp;bits (again the smallest possible),
  991. and the MSE is 1.495E-5.
  992. </t>
  993. <t>
  994. We also have a reversible 32-point DCT approximation with uniform,
  995. orthonormal scaling using 87&nbsp;multiplies, 215&nbsp;additions, and
  996. 38&nbsp;shifts.
  997. By comparison, the 32-point transform in VP9 uses 116&nbsp;multiplies,
  998. 194&nbsp;additions, and 66&nbsp;shifts.
  999. Our dynamic range expansion is still the minimal 2.5&nbsp;bits, and the MSE is
  1000. 8.006E-05.
  1001. </t>
  1002. <t>
  1003. Code for all of these transforms is available in the development repository
  1004. listed in&nbsp;<xref target="development_repository"/>.
  1005. </t>
  1006. </section>
  1007. <section anchor="hadamard_transforms" title="Walsh-Hadamard Transforms">
  1008. <t>
  1009. These techniques can also be applied to constructing Walsh-Hadamard
  1010. Transforms, another useful transform family that is cheaper to implement than
  1011. the DCT (since it requires no multiplications at all).
  1012. The WHT has many applications as a cheap way to approximately change the time
  1013. and frequency resolution of a set of data (either individual bands, as in the
  1014. Opus audio codec, or whole blocks).
  1015. VP9 uses it as a reversible transform with uniform, orthonormal scaling for
  1016. lossless coding in place of its DCT, which does not have these properties.
  1017. </t>
  1018. <t>
  1019. Applying a 2x2 WHT to a block of 2x2 inputs involves running a 2-point WHT on
  1020. the rows, and then another 2-point WHT on the columns.
  1021. The basis functions for the 2-point WHT are, up to scaling, [1,&nbsp;1] and
  1022. [1,&nbsp;-1].
  1023. The four variations of a two-step lifter given in
  1024. <xref target="lifting_steps"/> are exactly the lifting steps needed to
  1025. implement a 2x2 WHT: two stages that produce asymmetrically scaled outputs
  1026. followed by two stages that consume asymmetrically scaled inputs.
  1027. <figure align="left">
  1028. <artwork align="left"><![CDATA[
  1029. Input: x00, x01, x10, x11
  1030. Output: y00, y01, y10, y11
  1031. /* Transform rows */
  1032. t1 = x00 - x01
  1033. t0 = x00 - (t1 >> 1) /* == (x00 + x01)/2 */
  1034. t2 = x10 + x11
  1035. t3 = (t2 >> 1) - x11 /* == (x10 - x11)/2 */
  1036. /* Transform columns */
  1037. y00 = t0 + (t2 >> 1) /* == (x00 + x01 + x10 + x11)/2 */
  1038. y10 = y00 - t2 /* == (x00 + x01 - x10 - x11)/2 */
  1039. y11 = (t1 >> 1) - t3 /* == (x00 - x01 - x10 + x11)/2 */
  1040. y01 = t1 - y11 /* == (x00 - x01 + x10 - x11)/2 */
  1041. ]]></artwork>
  1042. </figure>
  1043. </t>
  1044. <t>
  1045. By simply re-ordering the operations, we can see that there are two shifts that
  1046. may be shared between the two stages:
  1047. <figure align="left">
  1048. <artwork align="left"><![CDATA[
  1049. Input: x00, x01, x10, x11
  1050. Output: y00, y01, y10, y11
  1051. t1 = x00 - x01
  1052. t2 = x10 + x11
  1053. t0 = x00 - (t1 >> 1) /* == (x00 + x01)/2 */
  1054. y00 = t0 + (t2 >> 1) /* == (x00 + x01 + x10 + x11)/2 */
  1055. t3 = (t2 >> 1) - x11 /* == (x10 - x11)/2 */
  1056. y11 = (t1 >> 1) - t3 /* == (x00 - x01 - x10 + x11)/2 */
  1057. y10 = y00 - t2 /* == (x00 + x01 - x10 - x11)/2 */
  1058. y01 = t1 - y11 /* == (x00 - x01 + x10 - x11)/2 */
  1059. ]]></artwork>
  1060. </figure>
  1061. </t>
  1062. <t>
  1063. By eliminating the double-negation of x11 and re-ordering the additions to it,
  1064. we can see even more operations in common:
  1065. <figure align="left">
  1066. <artwork align="left"><![CDATA[
  1067. Input: x00, x01, x10, x11
  1068. Output: y00, y01, y10, y11
  1069. t1 = x00 - x01
  1070. t2 = x10 + x11
  1071. t0 = x00 - (t1 >> 1) /* == (x00 + x01)/2 */
  1072. y00 = t0 + (t2 >> 1) /* == (x00 + x01 + x10 + x11)/2 */
  1073. t3 = x11 + (t1 >> 1) /* == x11 + (x00 - x01)/2 */
  1074. y11 = t3 - (t2 >> 1) /* == (x00 - x01 - x10 + x11)/2 */
  1075. y10 = y00 - t2 /* == (x00 + x01 - x10 - x11)/2 */
  1076. y01 = t1 - y11 /* == (x00 - x01 + x10 - x11)/2 */
  1077. ]]></artwork>
  1078. </figure>
  1079. </t>
  1080. <t>
  1081. Simplifying further, the whole transform may be computed with just
  1082. 7&nbsp;additions and 1&nbsp;shift:
  1083. <figure align="left">
  1084. <artwork align="left"><![CDATA[
  1085. Input: x00, x01, x10, x11
  1086. Output: y00, y01, y10, y11
  1087. t1 = x00 - x01
  1088. t2 = x10 + x11
  1089. t4 = (t2 - t1) >> 1 /* == (-x00 + x01 + x10 + x11)/2 */
  1090. y00 = x00 + t4 /* == (x00 + x01 + x10 + x11)/2 */
  1091. y11 = x11 - t4 /* == (x00 - x01 - x10 + x11)/2 */
  1092. y10 = y00 - t2 /* == (x00 + x01 - x10 - x11)/2 */
  1093. y01 = t1 - y11 /* == (x00 - x01 + x10 - x11)/2 */
  1094. ]]></artwork>
  1095. </figure>
  1096. </t>
  1097. <t>
  1098. This is a significant savings over other approaches described in the
  1099. literature, which require 8&nbsp;additions, 2&nbsp;shifts, and
  1100. 1&nbsp;negation&nbsp;<xref target="FOIK99"/> (37.5%&nbsp;more operations), or
  1101. 10&nbsp;additions, 1&nbsp;shift, and
  1102. 2&nbsp;negations&nbsp;<xref target="TSSRM08"/> (62.5%&nbsp;more operations).
  1103. The same operations can be applied to compute a 4-point WHT in one dimension.
  1104. This implementation is used in this way in VP9's lossless mode.
  1105. Since larger WHTs may be trivially factored into multiple smaller WHTs, the
  1106. same approach can implement a reversible, orthonormally scaled WHT of any size
  1107. (2**N)x(2**M), so long as (N&nbsp;+&nbsp;M) is even.
  1108. </t>
  1109. </section>
  1110. </section>
  1111. <section anchor="development_repository" title="Development Repository">
  1112. <t>
  1113. The tools presented here were developed as part of Xiph.Org's Daala project.
  1114. They are available, along with many others in greater and lesser states of
  1115. maturity, in the Daala git repository at
  1116. <eref target="https://git.xiph.org/daala.git"/>.
  1117. See <eref target="https://xiph.org/daala/"/> for more information.
  1118. </t>
  1119. </section>
  1120. <section title="IANA Considerations">
  1121. <t>
  1122. This document has no actions for IANA.
  1123. </t>
  1124. </section>
  1125. <section anchor="Acknowledgments" title="Acknowledgments">
  1126. <t>
  1127. Thanks to Nathan Egge, Gregory Maxwell, and Jean-Marc Valin for their
  1128. assistance in the implementation and experimentation, and in preparing this
  1129. draft.
  1130. </t>
  1131. </section>
  1132. </middle>
  1133. <back>
  1134. <!--references title="Normative References">
  1135. <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml"?>
  1136. </references-->
  1137. <references title="Informative References">
  1138. <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.6386.xml"?>
  1139. <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml"?>
  1140. <reference anchor="BE92">
  1141. <front>
  1142. <title>New Networks for Perfect Inversion and Perfect Reconstruction</title>
  1143. <author initials="F.A.M.L." surname="Bruekers"
  1144. fullname="Fons A.M.L. Bruekers"/>
  1145. <author initials="A.W.M." surname="van den Enden"
  1146. fullname="Ad W.M. van den Enden"/>
  1147. <date month="January" year="1992"/>
  1148. </front>
  1149. <seriesInfo name="IEEE Journal on Selected Areas in Communication"
  1150. value="10(1):129--137"/>
  1151. </reference>
  1152. <reference anchor="FOIK99">
  1153. <front>
  1154. <title>Lossless 8-point Fast Discrete Cosine Transform Using Lossless
  1155. Hadamard Transform</title>
  1156. <author initials="S." surname="Fukuma" fullname="Shinji Fukuma"/>
  1157. <author initials="K." surname="Oyama" fullname="Koichi Oyama"/>
  1158. <author initials="M." surname="Iwahashi" fullname="Masahiro Iwahashi"/>
  1159. <author initials="N." surname="Kambayashi" fullname="Noriyoshi Kambayashi"/>
  1160. <date month="October" year="1999"/>
  1161. </front>
  1162. <seriesInfo name="Technical Report"
  1163. value="The Institute of Electronics, Information, and Communication Engineers
  1164. of Japan"/>
  1165. </reference>
  1166. <reference anchor="LLM89">
  1167. <front>
  1168. <title>Practical Fast 1-D DCT Algorithms with 11 Multiplications</title>
  1169. <author initials="C." surname="Loeffler" fullname="Christoph Loeffler"/>
  1170. <author initials="A." surname="Ligtenberg" fullname="Adriaan Ligtenberg"/>
  1171. <author initials="G.S." surname="Moschytz" fullname="George S. Moschytz"/>
  1172. <date month="May" year="1989"/>
  1173. </front>
  1174. <seriesInfo name="Proc. Acoustics, Speech, and Signal Processing (ICASSP'89)"
  1175. value="vol. 2, pp. 988--991"/>
  1176. </reference>
  1177. <reference anchor="Pas76">
  1178. <front>
  1179. <title>Source Coding Algorithms for Fast Data Compression</title>
  1180. <author initials="R.C." surname="Pasco" fullname="Richard C. Pasco"/>
  1181. <date month="May" year="1976"/>
  1182. </front>
  1183. <seriesInfo name="Ph.D. Thesis"
  1184. value="Dept. of Electrical Engineering, Stanford University"/>
  1185. </reference>
  1186. <reference anchor="PKA69">
  1187. <front>
  1188. <title>Hadamard Transform Image Coding</title>
  1189. <author initials="W.K." surname="Pratt" fullname="W.K. Pratt"/>
  1190. <author initials="J." surname="Kane" fullname="J. Kane"/>
  1191. <author initials="H.C." surname="Andrews" fullname="H.C. Andrews"/>
  1192. <date month="Jan" year="1969"/>
  1193. </front>
  1194. <seriesInfo name="Proc. IEEE" value="57(1):58--68"/>
  1195. </reference>
  1196. <reference anchor="Que98">
  1197. <front>
  1198. <title>On Unitary Transform Approximations</title>
  1199. <author initials="R.L." surname="de Queiroz" fullname="Ricardo L. de Queiroz"/>
  1200. <date month="Feb" year="1998"/>
  1201. </front>
  1202. <seriesInfo name="IEEE Signal Processing Letters" value="5(2):46--47"/>
  1203. </reference>
  1204. <reference anchor="SLD04">
  1205. <front>
  1206. <title>An Improved N-Bit to N-Bit Reversible Haar-Like Transform</title>
  1207. <author initials="J.G." surname="Senecal" fullname="Joshua G. Senecal"/>
  1208. <author initials="P." surname="Lindstrom" fullname="Peter Lindstrom"/>
  1209. <author initials="M.A." surname="Duchaineau" fullname="Mark A. Duchaineau"/>
  1210. <date month="October" year="2004"/>
  1211. </front>
  1212. <seriesInfo
  1213. name="Proc. of the 12th Pacific Conference on Computer Graphics and Applications (PG'04)"
  1214. value="pp. 371--380"/>
  1215. </reference>
  1216. <reference anchor="TSSRM08">
  1217. <front>
  1218. <title>Low-complexity Hierarchical Lapped Transform for Lossy-to-Lossless
  1219. Image Coding in JPEG XR/HD Photo</title>
  1220. <author initials="C." surname="Tu" fullname="Chengjie Tu"/>
  1221. <author initials="S." surname="Srinivasan" fullname="Sridhar Srinivasan"/>
  1222. <author initials="G.J." surname="Sullivan" fullname="Gary J. Sullivan"/>
  1223. <author initials="S." surname="Regunathan" fullname="Shankar Regunathan"/>
  1224. <author initials="H.S." surname="Malvar" fullname="Henrique S. Malvar"/>
  1225. <date month="August" year="2008"/>
  1226. </front>
  1227. <seriesInfo name="Applications of Digital Image Processing XXXI"
  1228. value="vol 7073"/>
  1229. </reference>
  1230. </references>
  1231. </back>
  1232. </rfc>