123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- #!/usr/bin/env python3
# pdf2htmlEX embeds images inline in the HTML as <img src="data:" />
# This script:
# - removes the src="" (and alt) attributes
# - adds a new bg-N class to the img tag
# - copies the images into the CSS
- import bleach
- import glob
- import os
- import re
- import sys
- from pyquery import PyQuery as pq
# Must be given an HTML file (output of pdf2htmlEX)
# NOTE(review): both early exits below keep the original exit status of 0;
# confirm whether callers would prefer a non-zero status on failure.
if len(sys.argv) < 2:
    # sys.exit is used instead of the bare exit() helper, which is injected
    # by the site module and is not available in every interpreter setup.
    sys.exit()

# Which item to process
html_file = sys.argv[1]

# The CSS file of the document: same path as the HTML file with its
# extension replaced by '.images.css'. splitext is used so that inputs
# whose extension is not exactly five characters long (e.g. '.htm') do not
# lose part of their base name.
css_file = os.path.splitext(html_file)[0] + '.images.css'

if not os.path.isfile(html_file):
    print("Input file doesn't exist in the library.")
    sys.exit()

# Store all the base64 images here, keyed by CSS selector ('.bg-N')
images = {}
# Move the inline image of an <img> element out of the markup
def background_to_css(index, element):
    """Replace an image's inline source with a numbered CSS class.

    The value of the element's ``src`` attribute is recorded in the
    module-level ``images`` mapping under the selector ``.bg-<index>``.
    The ``alt`` and ``src`` attributes are then removed from the element
    and the class ``bg-<index>`` is added to it.

    index   -- position of the image, used to build the class name
    element -- PyQuery wrapper around the <img> element to rewrite
    """
    css_class = 'bg-{}'.format(index)
    selector = '.{}'.format(css_class)

    # Capture the source before the attribute is stripped below.
    images[selector] = element.attr('src')

    for attribute in ('alt', 'src'):
        element.remove_attr(attribute)
    element.add_class(css_class)
# Read HTML file
with open(html_file, 'rt', encoding='utf-8') as f:
    dom = pq(f.read())

# Open sidebar when the outline has at least one entry
# NOTE(review): only #page-container is written back below, so this class
# change reaches the output only if #sidebar is nested inside it -- confirm.
# Writing the sidebar's own markup was previously left commented out.
if dom('#outline > ul > li'):
    dom('#sidebar').addClass('opened')

# Loop all images, moving each inline source into `images`
for index, item in enumerate(dom('#page-container img.bi')):
    background_to_css(index, dom(item))

# Build both outputs completely before opening any file for writing.
# Opening the HTML file in 'wt' mode truncates it immediately, so a failure
# while serializing would otherwise leave the input file empty.
page_html = dom('#page-container').outer_html()
if page_html is None:
    sys.exit('No #page-container element found in ' + html_file
             + '; files left untouched.')

css_rules = ''.join(
    selector + '{'
    + 'background-image: url(' + data_uri + ');'
    + 'background-position: center;'
    + 'background-repeat: no-repeat;'
    + 'background-size: cover;'
    + '}'
    for selector, data_uri in images.items()
)

# Overwrite HTML file with removed images
with open(html_file, 'wt', encoding='utf-8') as f:
    f.write(page_html)

# Append new CSS classes (with the images) to the document CSS file
with open(css_file, 'at', encoding='utf-8') as f:
    f.write(css_rules)
|