Browse Source

added hyphenation to build.py

Bachir Soussi Chiadmi 6 years ago
parent
commit
6dca30e067
1 changed files with 96 additions and 10 deletions
  1. 96 10
      bin/build.py

+ 96 - 10
bin/build.py

@@ -17,11 +17,21 @@ from bs4 import BeautifulSoup
 import pypandoc
 import json
 import re
+import pyphen
+
 
 _BOOK_SRC = 'book-src'
 _BUILD_d = "build"
 # CUR_PATH = os.path.dirname(os.path.abspath(__file__))
 
+
+# hyphenator
+_H_FR = pyphen.Pyphen(lang='fr')
+
+_ERROR_PREF = '\033[1m[!!]\033[0m '
+
+
+
 print("Building book")
 def main():
    # clean build directory
@@ -29,6 +39,9 @@ def main():
       shutil.rmtree(_BUILD_d, ignore_errors=True)
    os.mkdir(_BUILD_d)
 
+   print('Hyphen has fr language')
+   print('fr' in pyphen.LANGUAGES)
+
    parse_book(_BOOK_SRC)
 
 
@@ -91,22 +104,19 @@ def generate_html(book, toc):
    # get story div
    story_dom = template_dom.find('div', {"id":"flow-main"})
 
-   #
    # loop through pages to convert them to html and add it to main html file
-   # book_build_d_pages = os.path.join(_BUILD_d,'pages')
-   # os.mkdir(book_build_d_pages)
 
    pi = 0
-   for p in toc:
+   for page in toc:
       # print(toc[p])
-      pagename = toc[p]['label']
+      pagename = toc[page]['label']
       pageid = re.sub('[^a-z0-9]+', '-', pagename.lower())
-      print(pageid)
+      print("\033[92m"+pageid+"\033[0m")
 
       # files
-      in_f = os.path.join(_BOOK_SRC, toc[p]['file'])
+      in_f = os.path.join(_BOOK_SRC, toc[page]['file'])
       if not os.path.isfile(in_f):
-         print("Source path is not a file, can't generate html : "+in_f)
+         print(_ERROR_PREF+"Source path is not a file, can't generate html : "+in_f)
          continue
       # print('in_f : '+in_f)
 
@@ -124,8 +134,20 @@ def generate_html(book, toc):
                                extra_args=pdoc_args,
                                filters=pdoc_filters)
                               #  outputfile=out_f)
+
+      # print("output :\n"+output)
+
       output_dom = BeautifulSoup(output, 'html.parser')
 
+      # print("output_dom :")
+      # print(output_dom)
+
+      # hyphenate paragraphes
+      for node in output_dom.find_all('p'):
+         hyphenate(node)
+
+      # print(str(output_dom))
+
       # copy images
       for img in output_dom.find_all('img'):
          # print('-- img ',img)
@@ -140,7 +162,7 @@ def generate_html(book, toc):
          # print('- -  '+src_img)
 
          if not os.path.isfile(src_img):
-            print("Source path is not a file, can't copy img : "+src_img)
+            print(_ERROR_PREF+"Source path is not a file, can't copy img : \033[1m"+src_img+"\033[0m")
             continue
 
          dest_img = os.path.join(_BUILD_d, att_src)
@@ -164,8 +186,72 @@ def generate_html(book, toc):
    # create main html file from filled template html dom
    book_html_f = os.path.join(_BUILD_d,'stories.html')
    with open(book_html_f, 'w') as fp:
-      fp.write(template_dom.prettify())
+      fp.write(template_dom.prettify(formatter=None))
+
+def hyphenate(node):
+   # print("hyphenate")
+   nodetext = node.get_text()
+   # print(nodetext)
+   nodestr = str(node)
+   # print(nodestr)
+   for word in nodetext.split(' '):
+
+      # do not hyphenate if it's not a real word
+      if len(word) < 5 or re.search('\w+', word) == None:
+         continue
+
+      # cleaning word
+      # remove all non-alphanumerical characteres duplicated or more
+      word = re.sub('\W{2,}', '', word)
+      # remove all non-alphanumerical at the begining of word
+      word = re.sub('^\W', '', word)
+      # remove all non-alphanumerical at the end of word
+      word = re.sub('\W$', '', word)
+
+      # remove all word remaing having special chars
+      if re.search('\W+', word):
+         continue
 
+      # hyphenate word
+      word_hyphenated = _H_FR.inserted(word)
+      # remove hyphen precedeted by less than 3 letters
+      word_hyphenated = re.sub(r'^(\w{,2})-', r'\1', word_hyphenated)
+      # remove hyphen followed by less than 3 letters
+      word_hyphenated = re.sub(r'-(\w{,2})$', r'\1', word_hyphenated)
+      # replace scores by html elemt &shy;
+      word_hyphenated = re.sub(r'(\w)-(\w)', r'\1&shy;\2', word_hyphenated)
+      # replace double scores by score+$shy;
+      word_hyphenated = re.sub(r'--', r'-&shy;', word_hyphenated)
+      # TODO: attention au date 1950-1960, le tiret disparait
+
+      # print(word_hyphenated)
+
+      if re.search('\b+', word):
+         print(word+" | "+word_hyphenated)
+
+      try:
+         # replace word by hyhanated_word on source
+         nodestr = re.sub(word, word_hyphenated, nodestr)
+         # replaced_str_dom = BeautifulSoup(replaced_str, 'html.parser')
+         # node.string = replaced_str
+         # node.string.replace_with(node.string)
+      except Exception as e:
+         print(_ERROR_PREF+'Replacement error with \033[1m'+word+'\033[0m | \033[1m'+word_hyphenated+"\033[0m")
+         print(e)
+         print(node.string)
+         print('[//]')
+         pass
+
+   # add none breaking spaces
+   nbspzr = ['"', '»', '«', '\.', '\!', '\?', ':', ';']
+   for char in nbspzr:
+      # print(char)
+      nodestr = re.sub(r'('+char+')\s(\w)', r'\1&nbsp;\2', nodestr)
+      nodestr = re.sub(r'(\w)\s('+char+')', r'\1&nbsp;\2', nodestr)
+
+   # print(nodestr)
+   # replace node by hyphenated one
+   node.replace_with(nodestr)
 
 
 if __name__ == "__main__":