9 سال پیش · 6dca30e067
--- a/bin/build.py
+++ b/bin/build.py
@@ -17,11 +17,21 @@ from bs4 import BeautifulSoup
 
				 import pypandoc
			
 
				 import json
			
 
				 import re
			
 
				+import pyphen
			
 
				+
			
 
				 
			
 
				 _BOOK_SRC = 'book-src'
			
 
				 _BUILD_d = "build"
			
 
				 # CUR_PATH = os.path.dirname(os.path.abspath(__file__))
			
 
				 
			
 
				+
			
 
				+# hyphenator
			
 
				+_H_FR = pyphen.Pyphen(lang='fr')
			
 
				+
			
 
				+_ERROR_PREF = '\033[1m[!!]\033[0m '
			
 
				+
			
 
				+
			
 
				+
			
 
				 print("Building book")
			
 
				 def main():
			
 
				    # clean build directory
			
@@ -29,6 +39,9 @@ def main():
 
				       shutil.rmtree(_BUILD_d, ignore_errors=True)
			
 
				    os.mkdir(_BUILD_d)
			
 
				 
			
 
				+   print('Hyphen has fr language')
			
 
				+   print('fr' in pyphen.LANGUAGES)
			
 
				+
			
 
				    parse_book(_BOOK_SRC)
			
 
				 
			
 
				 
			
@@ -91,22 +104,19 @@ def generate_html(book, toc):
 
				    # get story div
			
 
				    story_dom = template_dom.find('div', {"id":"flow-main"})
			
 
				 
			
 
				-   #
			
 
				    # loop through pages to convert them to html and add it to main html file
			
 
				-   # book_build_d_pages = os.path.join(_BUILD_d,'pages')
			
 
				-   # os.mkdir(book_build_d_pages)
			
 
				 
			
 
				    pi = 0
			
 
				-   for p in toc:
			
 
				+   for page in toc:
			
 
				       # print(toc[p])
			
 
				-      pagename = toc[p]['label']
			
 
				+      pagename = toc[page]['label']
			
 
				       pageid = re.sub('[^a-z0-9]+', '-', pagename.lower())
			
 
				-      print(pageid)
			
 
				+      print("\033[92m"+pageid+"\033[0m")
			
 
				 
			
 
				       # files
			
 
				-      in_f = os.path.join(_BOOK_SRC, toc[p]['file'])
			
 
				+      in_f = os.path.join(_BOOK_SRC, toc[page]['file'])
			
 
				       if not os.path.isfile(in_f):
			
 
				-         print("Source path is not a file, can't generate html : "+in_f)
			
 
				+         print(_ERROR_PREF+"Source path is not a file, can't generate html : "+in_f)
			
 
				          continue
			
 
				       # print('in_f : '+in_f)
			
 
				 
			
@@ -124,8 +134,20 @@ def generate_html(book, toc):
 
				                                extra_args=pdoc_args,
			
 
				                                filters=pdoc_filters)
			
 
				                               #  outputfile=out_f)
			
 
				+
			
 
				+      # print("output :\n"+output)
			
 
				+
			
 
				       output_dom = BeautifulSoup(output, 'html.parser')
			
 
				 
			
 
				+      # print("output_dom :")
			
 
				+      # print(output_dom)
			
 
				+
			
 
				+      # hyphenate paragraphes
			
 
				+      for node in output_dom.find_all('p'):
			
 
				+         hyphenate(node)
			
 
				+
			
 
				+      # print(str(output_dom))
			
 
				+
			
 
				       # copy images
			
 
				       for img in output_dom.find_all('img'):
			
 
				          # print('-- img ',img)
			
@@ -140,7 +162,7 @@ def generate_html(book, toc):
 
				          # print('- -  '+src_img)
			
 
				 
			
 
				          if not os.path.isfile(src_img):
			
 
				-            print("Source path is not a file, can't copy img : "+src_img)
			
 
				+            print(_ERROR_PREF+"Source path is not a file, can't copy img : \033[1m"+src_img+"\033[0m")
			
 
				             continue
			
 
				 
			
 
				          dest_img = os.path.join(_BUILD_d, att_src)
			
@@ -164,8 +186,72 @@ def generate_html(book, toc):
 
				    # create main html file from filled template html dom
			
 
				    book_html_f = os.path.join(_BUILD_d,'stories.html')
			
 
				    with open(book_html_f, 'w') as fp:
			
 
				-      fp.write(template_dom.prettify())
			
 
				+      fp.write(template_dom.prettify(formatter=None))
			
 
				+
			
 
				+def hyphenate(node):
			
 
				+   # print("hyphenate")
			
 
				+   nodetext = node.get_text()
			
 
				+   # print(nodetext)
			
 
				+   nodestr = str(node)
			
 
				+   # print(nodestr)
			
 
				+   for word in nodetext.split(' '):
			
 
				+
			
 
				+      # do not hyphenate if it's not a real word
			
 
				+      if len(word) < 5 or re.search('\w+', word) == None:
			
 
				+         continue
			
 
				+
			
 
				+      # cleaning word
			
 
				+      # remove all non-alphanumerical characteres duplicated or more
			
 
				+      word = re.sub('\W{2,}', '', word)
			
 
				+      # remove all non-alphanumerical at the begining of word
			
 
				+      word = re.sub('^\W', '', word)
			
 
				+      # remove all non-alphanumerical at the end of word
			
 
				+      word = re.sub('\W$', '', word)
			
 
				+
			
 
				+      # remove all word remaing having special chars
			
 
				+      if re.search('\W+', word):
			
 
				+         continue
			
 
				 
			
 
				+      # hyphenate word
			
 
				+      word_hyphenated = _H_FR.inserted(word)
			
 
				+      # remove hyphen precedeted by less than 3 letters
			
 
				+      word_hyphenated = re.sub(r'^(\w{,2})-', r'\1', word_hyphenated)
			
 
				+      # remove hyphen followed by less than 3 letters
			
 
				+      word_hyphenated = re.sub(r'-(\w{,2})$', r'\1', word_hyphenated)
			
 
				+      # replace scores by html elemt &shy;
			
 
				+      word_hyphenated = re.sub(r'(\w)-(\w)', r'\1&shy;\2', word_hyphenated)
			
 
				+      # replace double scores by score+$shy;
			
 
				+      word_hyphenated = re.sub(r'--', r'-&shy;', word_hyphenated)
			
 
				+      # TODO: attention au date 1950-1960, le tiret disparait
			
 
				+
			
 
				+      # print(word_hyphenated)
			
 
				+
			
 
				+      if re.search('\b+', word):
			
 
				+         print(word+" | "+word_hyphenated)
			
 
				+
			
 
				+      try:
			
 
				+         # replace word by hyhanated_word on source
			
 
				+         nodestr = re.sub(word, word_hyphenated, nodestr)
			
 
				+         # replaced_str_dom = BeautifulSoup(replaced_str, 'html.parser')
			
 
				+         # node.string = replaced_str
			
 
				+         # node.string.replace_with(node.string)
			
 
				+      except Exception as e:
			
 
				+         print(_ERROR_PREF+'Replacement error with \033[1m'+word+'\033[0m | \033[1m'+word_hyphenated+"\033[0m")
			
 
				+         print(e)
			
 
				+         print(node.string)
			
 
				+         print('[//]')
			
 
				+         pass
			
 
				+
			
 
				+   # add none breaking spaces
			
 
				+   nbspzr = ['"', '»', '«', '\.', '\!', '\?', ':', ';']
			
 
				+   for char in nbspzr:
			
 
				+      # print(char)
			
 
				+      nodestr = re.sub(r'('+char+')\s(\w)', r'\1&nbsp;\2', nodestr)
			
 
				+      nodestr = re.sub(r'(\w)\s('+char+')', r'\1&nbsp;\2', nodestr)
			
 
				+
			
 
				+   # print(nodestr)
			
 
				+   # replace node by hyphenated one
			
 
				+   node.replace_with(nodestr)
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":