bachir
/
gitbook-html2print


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
							#!/usr/bin/python
# -*- coding: utf-8 -*-


# @Author: Bachir Soussi Chiadmi <bach>
# @Date:   27-03-2017
# @Email:  bachir@figureslibres.io
# @Last modified by:   bach
# @Last modified time: 21-04-2017
# @License: GPL-V3


import sys, os, shutil
import markdown
# import mistune
from bs4 import BeautifulSoup
import pypandoc
import json
import re
import pyphen


# directory holding the book sources (SUMMARY.md, markdown pages and images)
_BOOK_SRC = 'book-src'
# output directory receiving stories.html and the copied images
_BUILD_d = "build"
# CUR_PATH = os.path.dirname(os.path.abspath(__file__))


# French hyphenation dictionary (pyphen), used by hyphenate()
_H_FR = pyphen.Pyphen(lang='fr')

# bold "[!!]" marker (ANSI escape codes) prepended to error messages
_ERROR_PREF = '\033[1m[!!]\033[0m '


# NOTE: executed at import time, before main() is called
print("Building book")
def main():
   """Prepare the build directory and build the book.

   Creates the build directory when it is missing, removes the `assets`
   directory and the `stories.html` file left by a previous build, prints
   whether pyphen ships a French dictionary, then parses the book sources.
   """
   # make sure the build directory exists
   if not os.path.isdir(_BUILD_d):
      # shutil.rmtree(_BUILD_d, ignore_errors=True)
      os.mkdir(_BUILD_d)

   # drop assets copied by a previous build
   assets_d = os.path.join(_BUILD_d, 'assets')
   if os.path.isdir(assets_d):
      shutil.rmtree(assets_d, ignore_errors=True)

   # drop the previous html output; it is a regular file, so it has to be
   # removed with os.remove (shutil.rmtree only works on directories)
   stories_f = os.path.join(_BUILD_d, 'stories.html')
   if os.path.isfile(stories_f):
      try:
         os.remove(stories_f)
      except OSError as e:
         print(_ERROR_PREF+"Can't remove previous build file : "+stories_f)
         print(e)

   print('Hyphen has fr language')
   print('fr' in pyphen.LANGUAGES)

   parse_book(_BOOK_SRC)


def parse_book(book):
   """Read SUMMARY.md and generate the html build from the pages it lists.

   The summary is always read from the `_BOOK_SRC` directory; `book` is only
   forwarded to generate_html(). When the summary file is missing, or does
   not contain any list, a message is printed and nothing is generated.

   book -- book identifier, passed through to generate_html()
   """
   # book_name = book.replace('.git', '')
   print("Parse book")

   # table of content (ordered list of markdown files)
   sum_p = os.path.join(_BOOK_SRC, "SUMMARY.md")
   if not os.path.isfile(sum_p):
      print("No summary file, can't generate html")
      return

   # read the summary, making sure the file handle is released
   with open(sum_p) as sum_f:
      sum_str = sum_f.read()

   # convert md to html
   sum_html = markdown.markdown(sum_str)
   # create dom from html string (as it will be parsable)
   sum_dom = BeautifulSoup(sum_html, 'html.parser')

   # without a list there is no table of content to walk through
   if sum_dom.ul is None:
      print("No list found in summary file, can't generate html")
      return

   # parse html dom to get file list in the right order
   toc = parse_summary(sum_dom.ul, {})

   # generate final html build for html2print
   generate_html(book, toc)

def parse_summary(ul, toc):
   """Fill `toc` with the pages linked from the direct items of a list.

   Each direct <li> child of `ul` owns one index (its position in the list,
   starting at 0). For every direct <a> child of that item, the entry
   toc[index] is set to {'label': link text, 'file': href without its
   leading slash}; when an item holds several links the last one wins, and
   an item without link leaves its index unused. Nested lists are ignored.

   ul  -- list element exposing find_all()
   toc -- dict to fill; it is modified in place and returned
   """
   print("Parse summary")
   for position, item in enumerate(ul.find_all('li', recursive=False)):
      for link in item.find_all('a', recursive=False):
         # paths are relative to the book sources: drop one leading slash
         target = re.sub(r'^/', '', link['href'])
         toc[position] = {
            'label': link.get_text(strip=True),
            'file': target
         }

   return toc


def generate_html(book, toc):
   """Convert every page of `toc` to html and write build/stories.html.

   The template `templates/main.tpl.html` is loaded, each markdown page is
   converted with pandoc, its images are copied to the build directory, its
   paragraphs are hyphenated, and the result is appended to the template's
   `div#flow-main` inside a `div.story-page`. Pages whose source file is
   missing are reported and skipped.

   book -- book identifier (currently unused here)
   toc  -- dict of {index: {'label': ..., 'file': ...}} as built by
           parse_summary(); pages are emitted in ascending index order
   """
   print("Generate html")
   #
   # create main html dom from template
   with open("templates/main.tpl.html", "r") as template_f:
      template_html = template_f.read()
   template_dom = BeautifulSoup(template_html, 'html.parser')
   # replace title
   # template_dom.html.head.title.contents[0].replaceWith(book_name)
   # get story div
   story_dom = template_dom.find('div', {"id":"flow-main"})
   if story_dom is None:
      print(_ERROR_PREF+"No div#flow-main in template, can't generate html")
      return

   # NOTE(review): `--smart` was removed in pandoc 2.0 (replaced by the
   # `+smart` input extension); confirm which pandoc version this targets.
   pdoc_args = ['--mathjax',
                '--smart']

   pdoc_filters = []

   # loop through pages to convert them to html and add it to main html file
   # (sorted: do not rely on dict ordering to keep the summary order)
   pi = 0
   for page in sorted(toc):
      pagename = toc[page]['label']
      pageid = re.sub('[^a-z0-9]+', '-', pagename.lower())
      print("\033[92m"+pageid+"\033[0m")

      # files
      in_f = os.path.join(_BOOK_SRC, toc[page]['file'])
      if not os.path.isfile(in_f):
         print(_ERROR_PREF+"Source path is not a file, can't generate html : "+in_f)
         continue

      output = pypandoc.convert_file(in_f,
                               to='html5',
                               format='markdown+header_attributes+link_attributes+bracketed_spans',
                               extra_args=pdoc_args,
                               filters=pdoc_filters)

      output_dom = BeautifulSoup(output, 'html.parser')

      # copy images first: hyphenate() replaces each <p> by a plain string,
      # after which images located inside paragraphs can't be found anymore
      _copy_page_images(output_dom)

      # hyphenate paragraphs
      for node in output_dom.find_all('p'):
         hyphenate(node)

      # append html story page to template_dom
      story_page = BeautifulSoup('<div class="story-page story-page-'+str(pi)+'" id="'+pageid+'"></div>', 'html.parser')
      story_page.div.append(output_dom)
      story_dom.append(story_page)

      # only pages actually added to the build are counted
      pi = pi+1

   # create main html file from filled template html dom
   # (formatter=None: strings are written without entity escaping, which the
   # markup produced by hyphenate() relies on)
   book_html_f = os.path.join(_BUILD_d,'stories.html')
   with open(book_html_f, 'w') as fp:
      fp.write(template_dom.prettify(formatter=None))


def _copy_page_images(page_dom):
   """Make <img> sources relative and copy the image files to the build dir.

   Every image of `page_dom` gets the leading slash of its `src` removed; the
   file is then copied from the book sources to the same relative path under
   the build directory. Images without `src` are ignored, missing files are
   reported and skipped.
   """
   for img in page_dom.find_all('img'):
      src = img.get('src')
      if not src:
         continue

      att_src = re.sub(r"^\/", "", src)
      img['src'] = att_src

      src_img = os.path.join(_BOOK_SRC, att_src)
      if not os.path.isfile(src_img):
         print(_ERROR_PREF+"Source path is not a file, can't copy img : \033[1m"+src_img+"\033[0m")
         continue

      dest_img = os.path.join(_BUILD_d, att_src)
      dest_path = os.path.dirname(dest_img)
      if not os.path.isdir(dest_path):
         os.makedirs(dest_path)

      shutil.copyfile(src_img, dest_img)

def hyphenate(node):
   """Add soft hyphens and non-breaking spaces to a node, in place.

   Words of the node's text are hyphenated with the French pyphen dictionary
   and the hyphenation points are written as `&shy;` entities. A space
   before `»`, `!`, `?`, `:`, `;` or after `«` becomes `&nbsp;`.

   The node is finally replaced in its document by a plain string holding
   the resulting markup, so the document has to be serialized without
   entity escaping (generate_html() uses prettify(formatter=None)).

   node -- element exposing get_text(), replace_with() and whose str() is
           its html serialization
   """
   nodetext = node.get_text()
   nodestr = str(node)

   # word -> soft-hyphenated word
   hyphenated = {}

   # split on any whitespace: with split(' ') two words separated by a line
   # break were seen as a single invalid token and never hyphenated
   for word in nodetext.split():

      # do not hyphenate if it's not a real word
      if len(word) < 5 or re.search(r'\w', word) is None:
         continue

      # cleaning word
      # remove runs of two or more non-alphanumerical characters
      word = re.sub(r'\W{2,}', '', word)
      # remove a non-alphanumerical character at the beginning of the word
      word = re.sub(r'^\W', '', word)
      # remove a non-alphanumerical character at the end of the word
      word = re.sub(r'\W$', '', word)

      # skip words already handled, and words still holding special chars
      # (e.g. "1950-1960" or "l'exemple" are left untouched)
      if word in hyphenated or re.search(r'\W', word):
         continue

      # hyphenate word
      word_hyphenated = _H_FR.inserted(word)
      # remove a hyphen preceded by less than 3 letters
      word_hyphenated = re.sub(r'^(\w{,2})-', r'\1', word_hyphenated)
      # remove a hyphen followed by less than 3 letters
      word_hyphenated = re.sub(r'-(\w{,2})$', r'\1', word_hyphenated)
      # replace hyphens by the html entity &shy; (lookarounds, so that
      # hyphens separated by a single letter are all replaced)
      word_hyphenated = re.sub(r'(?<=\w)-(?=\w)', '&shy;', word_hyphenated)

      hyphenated[word] = word_hyphenated

   if hyphenated:
      def _swap(match):
         # whole words only: a word is never replaced inside a longer one
         return hyphenated.get(match.group(0), match.group(0))

      # split the markup on tags; with the capturing group, even indexes
      # are text and odd indexes are tags, which must stay untouched
      # (tag names and attribute values such as href/src would be broken)
      chunks = re.split(r'(<[^>]*>)', nodestr)
      for idx in range(0, len(chunks), 2):
         chunks[idx] = re.sub(r'\w+', _swap, chunks[idx])
      nodestr = ''.join(chunks)

   # add non-breaking spaces
   # before closing quote and double punctuation (after a word or a tag)
   nodestr = re.sub(r'(\w|>)\s(»|!|\?|:|;)', r'\1&nbsp;\2', nodestr)
   # after opening quote (before a word or a tag)
   nodestr = re.sub(r'(«)\s(\w|<)', r'\1&nbsp;\2', nodestr)

   # replace node by hyphenated one
   node.replace_with(nodestr)


# run the build only when this file is executed as a script
if __name__ == "__main__":
   main()