GzipViz.hs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. -- build: ghc --make -O3 GzipViz -main-is GzipViz.main -o gzipviz
  2. -- This program is a rough draft. See [BUG] below for security advisory
  3. {-# LANGUAGE ParallelListComp, ScopedTypeVariables #-}
  4. module GzipViz where
  5. import Data.List
  6. import Data.Char (chr, ord)
  7. import Data.Maybe (fromJust)
  8. import Data.Bits
  9. import Text.Printf
  10. import System.IO
  11. import System.Process
  12. ztext_example1 :: [Char]
  13. ztext_example1 = "1f8b0800" ++ "5ad32e5d" ++ "0203" ++ "5d92bd6ec3300c84f73ec5ed0d02b428daa163a73e862c3131015934442a7f4f5fca6ed2a48b2148e4dd77a4bf64a65a38729bc08a18726c39182598600c0782d2816ac8986b2f3526858dc19078b7a38a5d15ef3445e6fd687e31ca2459f6cdebb8605fa5cd7879dde0c2256edc204ddd2b9484896a6cf5fc89d4a8db5572673eb01a4790ab47d38d6b630a673885abb103b95ef77b4fa0ec25554a7752a39020bbe5ed432185748bafbb78635084ac82815cea31e92caa3ce4337494a3e723c88993d348819a57e1f96d83e3c899aed84ba9763c8f29c57bdd1251a6595a594812ebdcba019d3c1395484bec3e88e5709d451214b1d5dae57cb621e72dbeed1ff25c2971fc251e1c442a2d6b604f62fdb243f3a553fcc5be2da850b3bec8358fefb0ac41fd536f7bea339da8f4c177442e89dc2f3efe25749ae98ea3ab4ca2e62986653ee62eb2aa7b1f8b53c382bf6d9f7e00" ++ "445d9755" ++ "70020000"
  14. -- Only used to read hexdumped inputs
  15. hex2bin :: Num a => Char -> [a]
  16. hex2bin '0' = [0,0,0,0]
  17. hex2bin '1' = [1,0,0,0]
  18. hex2bin '2' = [0,1,0,0]
  19. hex2bin '3' = [1,1,0,0]
  20. hex2bin '4' = [0,0,1,0]
  21. hex2bin '5' = [1,0,1,0]
  22. hex2bin '6' = [0,1,1,0]
  23. hex2bin '7' = [1,1,1,0]
  24. hex2bin '8' = [0,0,0,1]
  25. hex2bin '9' = [1,0,0,1]
  26. hex2bin 'a' = [0,1,0,1]
  27. hex2bin 'b' = [1,1,0,1]
  28. hex2bin 'c' = [0,0,1,1]
  29. hex2bin 'd' = [1,0,1,1]
  30. hex2bin 'e' = [0,1,1,1]
  31. hex2bin 'f' = [1,1,1,1]
  32. hex2bin 'A' = [0,1,0,1]
  33. hex2bin 'B' = [1,1,0,1]
  34. hex2bin 'C' = [0,0,1,1]
  35. hex2bin 'D' = [1,0,1,1]
  36. hex2bin 'E' = [0,1,1,1]
  37. hex2bin 'F' = [1,1,1,1]
  38. hex2bin x = error ("not a hex digit: " ++ show x)
  39. char2bin :: Char -> [Int]
  40. char2bin b = [(ord b`shiftR`i) .&. 1 | i <- [0,1,2,3,4,5,6,7]]
  41. -- State of the Decompressor
  42. data State
  43. -- initial state
  44. = I0
  45. | B0 {b1_bfinal::Maybe Int,
  46. b1_btype::Maybe (Int, Int),
  47. b3_hlit::Maybe Int,
  48. b3_hdist::Maybe Int,
  49. b3_hclen::Maybe Int}
  50. | ReadLitLen {b3::State, clatree::Tree01 Int, litlen_code_cls::[(Int, Int)], litlen_read_next::Int}
  51. | ReadDists {b3::State, clatree::Tree01 Int, llctree::Tree01 LLC, dists_cls::[(Int, Int)], dists_read_next::Int}
  52. | ReadMain {b3::State, clatree::Tree01 Int, llctree::Tree01 LLC, disttree::Tree01 Int, decoded_prefix::[DecodedExtent]}
  53. | BlockDone {rs::State}
  54. -- "error" with reason + diagnostics
  55. | Err String String
  56. -- "not yet implemented"
  57. | Nyi String String
  58. deriving (Show)
  59. isErr :: State -> Bool
  60. isErr (Err _ _) = True
  61. isErr _ = False
  62. isNyi :: State -> Bool
  63. isNyi (Nyi _ _) = True
  64. isNyi _ = False
  65. data DecodedExtent = DE {de_numbits :: Int, de_text :: [Int]}
  66. deriving (Show)
  67. extents_to_string :: [DecodedExtent] -> [Int]
  68. extents_to_string [] = []
  69. extents_to_string ((DE _ w):t) = w ++ extents_to_string t
  70. data LLC = Literal Int | Stop | Lencode Int
  71. deriving Show
  72. toLLC :: Int -> LLC
  73. toLLC i | i < 256 = Literal i
  74. toLLC 256 = Stop
  75. toLLC i = Lencode i
  76. readlitlen_hlit :: State -> Int
  77. readlitlen_hlit = fromJust . b3_hlit . b3
  78. readdist_hdist :: State -> Int
  79. readdist_hdist = fromJust . b3_hdist . b3
  80. decode :: State -> [Int] -> (State, [Int])
  81. decode s [] = (Err "input too short" (show s), [])
  82. decode s@(Nyi _ _) x = (s, x)
  83. decode s@(Err _ _) x = (s, x)
  84. -- Skip fixed number of bytes. Inaccurate in general, headers may be longer (filename, other extensions)
  85. decode s@I0 bs
  86. | length bs < 8 * 10 = (Err "header too short" (show s), bs)
  87. | otherwise = decode (B0 Nothing Nothing Nothing Nothing Nothing) (drop (8 * 10) bs)
  88. decode s@(B0 Nothing Nothing _ _ _) (h:t) = (s { b1_bfinal = Just h }, t)
  89. -- Undefined block type
  90. decode s@(B0 (Just _) Nothing _ _ _) bs@(1:1:_) = (Err "BTYPE 0b11" (show s), bs)
  91. -- fixed-huff block, not yet implemented
  92. decode s@(B0 (Just _) (Just (0,1)) _ _ _) t = (Nyi "BTYPE 0b01" (show s), t)
  93. -- uncompressed block, not yet implemented
  94. decode s@(B0 (Just _) (Just (0,0)) _ _ _) t = (Nyi "BTYPE 0b00" (show s), t)
  95. decode s@(B0 (Just b) (Just (1,0)) Nothing Nothing Nothing) bs =
  96. case readValue 5 bs of
  97. Nothing -> (Err "input too short" (show s), bs)
  98. Just (v0, bs') ->
  99. case readValue 5 bs' of
  100. Nothing -> (Err "input too short" (show s), bs')
  101. Just (v1, bs'') ->
  102. case readValue 4 bs'' of
  103. Nothing -> (Err "input too short" (show s), bs'')
  104. Just (v2, t) ->
  105. (B0 (Just b) (Just (1,0)) (Just (v0+257)) (Just (v1+1)) (Just (v2+4)), t)
  106. decode s@(B0 (Just _) Nothing _ _ _) (h:i:t) = (s { b1_btype = Just (i,h) }, t)
  107. decode s@(B0 _ _ (Just _) (Just _) (Just hclen')) bs =
  108. if
  109. length bs < 3 * hclen' then (Err "input too short" (show s), bs)
  110. else
  111. let
  112. list = (sort (filter ((/=0).fst) (zip [bin2int (take 3 (drop (3*i) bs))| i <- [0..hclen'-1]] [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15])))
  113. tree = mkTree list
  114. in
  115. case tree of
  116. Nothing -> (Err "couldn't parse codelength alphabet tree" (show s), bs)
  117. Just tree' -> (ReadLitLen s tree' [] 0, drop (3*hclen') bs)
  118. decode s@(ReadLitLen s' theClatree codelengths i) bs
  119. | i == readlitlen_hlit s =
  120. case mkTree (map (\(x,y) -> (x,toLLC y)) (sort (filter ((/=0).fst) codelengths))) of
  121. Nothing -> (Err "couldn't parse literal/length alphabet tree" (show s), bs)
  122. Just tr -> (ReadDists s' theClatree tr codelengths 0, bs)
  123. decode s@(ReadLitLen s' theClatree codelengths i) bs =
  124. case readTree theClatree bs of
  125. Nothing -> (Err ("couldn't read with" ++ show theClatree) (show s), bs)
  126. Just (16, bs') -> if codelengths == []
  127. then (Err "error" (show s), bs)
  128. else case readValue 2 bs' of
  129. Nothing -> (Err "input too short" (show s), bs)
  130. Just (x, bs'') -> (ReadLitLen s' theClatree ([ ((fst.head $ codelengths), i+counter) | counter<-[0..3+x-1] ]++codelengths) (i+3+x), bs'')
  131. Just (17, bs') -> case readValue 3 bs' of
  132. Nothing -> (Err "input too short" (show s), bs)
  133. Just (x, bs'') -> (ReadLitLen s' theClatree ([ (0, i+counter) | counter<-[0..3+x-1] ]++codelengths) (i+3+x), bs'')
  134. Just (18, bs') -> case readValue 7 bs' of
  135. Nothing -> (Err "input too short" (show s), bs)
  136. Just (x, bs'') -> (ReadLitLen s' theClatree ([ (0, i+counter) | counter<-[0..11+x-1] ]++codelengths) (i+11+x), bs'')
  137. Just (v, bs') -> (ReadLitLen s' theClatree ((v,i):codelengths) (i+1), bs')
  138. decode s@(ReadDists s' theClatree theTheLlctree dists i) bs
  139. | i == readdist_hdist s =
  140. case mkTree (map (\(x,y) -> (x, y)) (sort (filter ((/=0).fst) (take (fromJust . b3_hdist . b3 $ s) dists)))) of
  141. Nothing -> (Err "couldn't parse hdist tree" (show s), bs)
  142. Just tr -> (ReadMain s' theClatree theTheLlctree tr [], bs)
  143. decode s@(ReadDists s' theClatree theLlctree dists i) bs =
  144. case readTree theClatree bs of
  145. Nothing -> (Err "couldn't read input" (show s), bs)
  146. Just (16, bs') -> if dists == []
  147. then (Err "no dists" (show s), bs')
  148. else case readValue 2 bs' of
  149. Nothing -> (Err "input too short" (show s), bs')
  150. Just (x, bs'') -> (ReadDists s' theClatree theLlctree ([ ((fst.head $ dists), i+counter) | counter<-[0..3+x-1] ]++dists) (i+3+x), bs'')
  151. Just (17, bs') -> case readValue 3 bs' of
  152. Nothing -> (Err "input too short" (show s), bs')
  153. Just (x, bs'') -> (ReadDists s' theClatree theLlctree ([ (0, i+counter) | counter<-[0..3+x-1] ]++dists) (i+3+x), bs'')
  154. Just (18, bs') -> case readValue 7 bs' of
  155. Nothing -> (Err "input too short" (show s), bs')
  156. Just (x, bs'') -> (ReadDists s' theClatree theLlctree ([ (0, i+counter) | counter<-[0..11+x-1] ]++dists) (i+11+x), bs'')
  157. Just (v, bs') -> (ReadDists s' theClatree theLlctree ((v,i):dists) (i+1), bs')
  158. -- TODO use a monad to consume the input stream - and for error handling
  159. decode s@(ReadMain _ _ theLlctree theDisttree theDecodedPrefix) bs =
  160. case readTree theLlctree bs of
  161. Nothing -> (Err "couldn't read literal/length code" (show s), bs)
  162. Just (Literal c, bs') -> (s { decoded_prefix = DE (length bs - length bs') [c]:theDecodedPrefix }, bs')
  163. Just (Stop, bs') -> (BlockDone s, bs')
  164. Just (Lencode l, bs') ->
  165. case readValue (lxbits l) bs' of
  166. Just (lx, bs'') ->
  167. let len = lenbase l + lx
  168. in case readTree theDisttree bs'' of
  169. Nothing -> (Err "coudln't read distance code" (show s), bs')
  170. Just (d, bs''') -> case readValue (distbits d) bs''' of
  171. Just (dx, bs'''') ->
  172. let dis = distbase d + dx
  173. relevant_rsuffix :: [Int] = take dis
  174. (extents_to_string theDecodedPrefix)
  175. (rx,sx) = divMod len (length relevant_rsuffix)
  176. w :: [Int] = drop (length relevant_rsuffix - sx) relevant_rsuffix ++ concat [relevant_rsuffix | _ <- [0..rx-1]]
  177. in
  178. (s { decoded_prefix = ((DE (length bs - length bs'''') w) : theDecodedPrefix) }, bs'''')
  179. Nothing -> (Err "input too short, couldn't read distance extra bits" (show s), bs''')
  180. Nothing -> (Err "input too short, couldn't read length extra bits" (show s), bs')
  181. -- decode s$(BlockDone s, bs) = (BlockDone s, bs) -- todo: allow multi-block streams
  182. decode s bs = (Nyi "catch-all" (show s), bs)
  183. distbits :: Int -> Int
  184. distbits d | d < 4 = 0
  185. distbits d | d < 6 = 1
  186. distbits d | d < 8 = 2
  187. distbits d | d < 10 = 3
  188. distbits d | d < 12 = 4
  189. distbits d | d < 14 = 5
  190. distbits d | d < 16 = 6
  191. distbits d | d < 18 = 7
  192. distbits d | d < 20 = 8
  193. distbits d | d < 22 = 9
  194. distbits d | d < 24 = 10
  195. distbits d | d < 26 = 11
  196. distbits d | d < 28 = 12
  197. distbits 28 = 13
  198. distbits 29 = 13
  199. distbits _ = error "no such distance extra bits code"
  200. distbase :: Int -> Int
  201. distbase d | d < 5 = d + 1
  202. distbase 4 = 5
  203. distbase 5 = 7
  204. distbase 6 = 9
  205. distbase 7 = 13
  206. distbase 8 = 17
  207. distbase 9 = 25
  208. distbase 10 = 33
  209. distbase 11 = 49
  210. distbase 12 = 65
  211. distbase 13 = 97
  212. distbase 14 = 129
  213. distbase 15 = 193
  214. distbase 16 = 257
  215. distbase 17 = 385
  216. distbase 18 = 513
  217. distbase 19 = 769
  218. distbase 20 = 1025
  219. distbase 21 = 1537
  220. distbase 22 = 2049
  221. distbase 23 = 3073
  222. distbase 24 = 4097
  223. distbase 25 = 6145
  224. distbase 26 = 8193
  225. distbase 27 = 12289
  226. distbase 28 = 16385
  227. distbase 29 = 24577
  228. distbase _ = error "no such distance code"
  229. lxbits :: Int -> Int
  230. lxbits l | l < 265 = 0
  231. lxbits l | l < 269 = 1
  232. lxbits l | l < 273 = 2
  233. lxbits l | l < 277 = 3
  234. lxbits l | l < 281 = 4
  235. lxbits l | l < 285 = 5
  236. lxbits 285 = 0
  237. lxbits _ = error "no such length extra bits code"
  238. lenbase :: Int -> Int
  239. lenbase l | l < 265 = l - 257 + 3
  240. lenbase 265 = 11
  241. lenbase 266 = 13
  242. lenbase 267 = 15
  243. lenbase 268 = 17
  244. lenbase 269 = 19
  245. lenbase 270 = 23
  246. lenbase 271 = 27
  247. lenbase 272 = 31
  248. lenbase 273 = 35
  249. lenbase 274 = 43
  250. lenbase 275 = 51
  251. lenbase 276 = 59
  252. lenbase 277 = 67
  253. lenbase 278 = 83
  254. lenbase 279 = 99
  255. lenbase 280 = 115
  256. lenbase 281 = 131
  257. lenbase 282 = 163
  258. lenbase 283 = 195
  259. lenbase 284 = 227
  260. lenbase 285 = 258
  261. lenbase _ = error "no such length code"
  262. data Tree01 a = Tip a | Tree {t0::Tree01 a, t1::Tree01 a}
  263. deriving Show
  264. -- @precondition: ls ascending by fst!!
  265. mkTree :: [(Int, a)] -> Maybe (Tree01 a)
  266. mkTree ls =
  267. case mkTree' ls of
  268. (t, []) -> Just t
  269. _ -> Nothing
  270. mkTree' :: [(Int, a)] -> (Tree01 a, [(Int, a)])
  271. mkTree' ((0,a):rest) = (Tip a, rest)
  272. mkTree' ls = (Tree left_hand_branch right_hand_branch, map (\(i,a) -> (i+1,a)) leftovers)
  273. where
  274. (left_hand_branch, rest_l) = mkTree' (map (\(i,a) -> (i-1,a)) ls)
  275. (right_hand_branch, leftovers) = mkTree' rest_l
  276. readValue nbits bs | nbits > length bs = Nothing
  277. readValue nbits bs = Just (bin2int (take nbits bs), drop nbits bs)
  278. readTree (Tip a) bs = Just (a, bs)
  279. readTree (Tree _ _) [] = Nothing
  280. readTree (Tree l _) (0:t) = readTree l t
  281. readTree (Tree _ r) (1:t) = readTree r t
  282. -- LSB first:
  283. bin2int :: [Int] -> Int
  284. bin2int bs = sum [b * 2^i | b <- bs | i <- [0::Int ..]]
  285. swaps :: [a] -> [a]
  286. swaps (h:i:t) = i:h:swaps t
  287. swaps x = x
  288. -- Limitations:
  289. -- * can't parse headers, extensions such as filename must not be present (a fixed number of bytes is skipped)
  290. -- * can only parse one BTYPE=dynamic-huffman block, nothing else
  291. -- (no consecutive blocks either: only one block, upto 32k of uncompressed data)
  292. -- * [BUG]: mkTree isn't robust against broken inputs (!!!)
  293. -- may loop indefinitely. This vulnerability must be fixed.
  294. -- don't use this as a CGI script just yet ... ;-)
  295. -- * visualization doesn't show (size of) headers and Huffman tables
  296. -- output goes into viz.html (truncating existing file if present)
  297. -- input is read from input.txt
  298. main :: IO ()
  299. main = do
  300. (_, Just ho1, _, hp1) <- createProcess (proc "/bin/gzip" ["-9", "-c", "-n", "input.txt"]) {std_out=CreatePipe}
  301. hSetEncoding ho1 latin1
  302. sOut <- hGetContents ho1
  303. _ <- waitForProcess hp1
  304. -- read bytes directly instead of hexdumping them first :-)
  305. let intxt :: [Char] = sOut
  306. -- concatMap ((\x -> printf "%02x" (x::Int)) . ord) sOut
  307. putStrLn (printf "gzip Compressed input length %d" (length intxt))
  308. -- print $ mkTree' [(3,0),(3,4),(3,5),(3,6),(3,7),(3,8),(3,17),(4,3),(5,18),(6,2),(6,16)]
  309. let (s, _) = last $ takeWhile (\(x,_) -> not (isErr x) && not (isNyi x))
  310. $ iterate (uncurry decode) (I0, concatMap char2bin intxt)
  311. -- concatMap hex2bin (swaps intxt))
  312. let dec = case s of
  313. BlockDone s' -> decoded_prefix s'
  314. ReadMain {} -> decoded_prefix s
  315. x -> error (show x)
  316. -- print $ show dec -- (decoded_prefix s)
  317. -- print $ map chr $ reverse (extents_to_string dec) -- (decoded_prefix s))
  318. putStrLn "done, writing output ..."
  319. handle <- openFile "viz.html" WriteMode
  320. hPutStr handle $ extentsToHtml (reverse dec)
  321. hClose handle
  322. putStrLn "done."
  323. -- TODO: spaces at the ends of display lines are not colored. How to achieve that in HTML/CSS?
  324. -- TODO more compact output using style definitions
  325. extentsToHtml :: [DecodedExtent] -> String
  326. extentsToHtml extents = "<html><head/><body><style>" ++ sout_style ++ "</style><div>" ++ aux extents ++ "</div></body></html>\n"
  327. where
  328. bpcs :: [Double] = map (\(DE b l) -> (fromIntegral b) / (fromIntegral (length l))) extents
  329. aux [] = []
  330. -- aux ((DE bits ints):t) = "<strike style=\" text-decoration: none; background-image: linear-gradient(transparent " ++ (show $ y bits ints) ++ "px, " ++ (c bits ints) ++ " 1em); \">" ++ concatMap saferUChr (reverse ints) ++ "</strike>" ++ aux t
  331. aux ((DE bits ints):t) = "<strike style=\" text-decoration: none; background-image: linear-gradient(" ++ (c bits ints) ++ " " ++ (show $ y bits ints) ++ "px, " ++ (c bits ints) ++ " 1em); \">" ++ concatMap saferUChr (reverse ints) ++ "</strike>" ++ aux t
  332. y _ _ = 0.0::Double
  333. bps bits ints = (fromIntegral bits) / (fromIntegral (length ints))
  334. c bits ints = printf "#%02x%02x%02x"
  335. ((floor $ 255.0 * ( bps bits ints / maximum bpcs ))::Int)
  336. ((floor $ 255.0 * (1.0 - bps bits ints / maximum bpcs ))::Int)
  337. -- (0::Int)
  338. (if length ints > 1 then 255::Int else 0::Int)
  339. saferUChr :: Int -> [Char]
  340. saferUChr i | i < 32 = "▒" -- light shade 1/4 block
  341. saferUChr i | chr i == '<' = "&lt;"
  342. saferUChr i | chr i == '>' = "&gt;"
  343. --saferUChr i | chr i == ' ' = " "
  344. saferUChr i = [chr i]
  345. -- unused functions
  346. org_aux [] = []
  347. org_aux ((x,y):t) =
  348. let (l,m) = splitPred' [] ((==x).fst) t
  349. in (x,y:map snd l):org_aux m
  350. splitPred' accum p (h:t) | p h = splitPred' (h:accum) p t
  351. splitPred' accum _ ls = (reverse accum, ls)
  352. sout_style =
  353. "body { background-color: black; color:#222; font-size: 2em; word-break: break-all; font-family: monospace, monospace; } "