|
@@ -17,11 +17,21 @@ from bs4 import BeautifulSoup
|
|
|
import pypandoc
|
|
|
import json
|
|
|
import re
|
|
|
+import pyphen
|
|
|
+
|
|
|
|
|
|
_BOOK_SRC = 'book-src'
|
|
|
_BUILD_d = "build"
|
|
|
# CUR_PATH = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
|
|
+
|
|
|
+# French hyphenator (pyphen dictionary for lang='fr'), used by hyphenate()
|
|
|
+_H_FR = pyphen.Pyphen(lang='fr')
|
|
|
+
|
|
|
+_ERROR_PREF = '\033[1m[!!]\033[0m '
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
print("Building book")
|
|
|
def main():
|
|
|
# clean build directory
|
|
@@ -29,6 +39,9 @@ def main():
|
|
|
shutil.rmtree(_BUILD_d, ignore_errors=True)
|
|
|
os.mkdir(_BUILD_d)
|
|
|
|
|
|
+ print('Hyphen has fr language')
|
|
|
+ print('fr' in pyphen.LANGUAGES)
|
|
|
+
|
|
|
parse_book(_BOOK_SRC)
|
|
|
|
|
|
|
|
@@ -91,22 +104,19 @@ def generate_html(book, toc):
|
|
|
# get story div
|
|
|
story_dom = template_dom.find('div', {"id":"flow-main"})
|
|
|
|
|
|
- #
|
|
|
# loop through pages to convert them to html and add it to main html file
|
|
|
- # book_build_d_pages = os.path.join(_BUILD_d,'pages')
|
|
|
- # os.mkdir(book_build_d_pages)
|
|
|
|
|
|
pi = 0
|
|
|
- for p in toc:
|
|
|
+ for page in toc:
|
|
|
# print(toc[p])
|
|
|
- pagename = toc[p]['label']
|
|
|
+ pagename = toc[page]['label']
|
|
|
pageid = re.sub('[^a-z0-9]+', '-', pagename.lower())
|
|
|
- print(pageid)
|
|
|
+ print("\033[92m"+pageid+"\033[0m")
|
|
|
|
|
|
# files
|
|
|
- in_f = os.path.join(_BOOK_SRC, toc[p]['file'])
|
|
|
+ in_f = os.path.join(_BOOK_SRC, toc[page]['file'])
|
|
|
if not os.path.isfile(in_f):
|
|
|
- print("Source path is not a file, can't generate html : "+in_f)
|
|
|
+ print(_ERROR_PREF+"Source path is not a file, can't generate html : "+in_f)
|
|
|
continue
|
|
|
# print('in_f : '+in_f)
|
|
|
|
|
@@ -124,8 +134,20 @@ def generate_html(book, toc):
|
|
|
extra_args=pdoc_args,
|
|
|
filters=pdoc_filters)
|
|
|
# outputfile=out_f)
|
|
|
+
|
|
|
+ # print("output :\n"+output)
|
|
|
+
|
|
|
output_dom = BeautifulSoup(output, 'html.parser')
|
|
|
|
|
|
+ # print("output_dom :")
|
|
|
+ # print(output_dom)
|
|
|
+
|
|
|
+ # hyphenate paragraphes
|
|
|
+ for node in output_dom.find_all('p'):
|
|
|
+ hyphenate(node)
|
|
|
+
|
|
|
+ # print(str(output_dom))
|
|
|
+
|
|
|
# copy images
|
|
|
for img in output_dom.find_all('img'):
|
|
|
# print('-- img ',img)
|
|
@@ -140,7 +162,7 @@ def generate_html(book, toc):
|
|
|
# print('- - '+src_img)
|
|
|
|
|
|
if not os.path.isfile(src_img):
|
|
|
- print("Source path is not a file, can't copy img : "+src_img)
|
|
|
+ print(_ERROR_PREF+"Source path is not a file, can't copy img : \033[1m"+src_img+"\033[0m")
|
|
|
continue
|
|
|
|
|
|
dest_img = os.path.join(_BUILD_d, att_src)
|
|
@@ -164,8 +186,72 @@ def generate_html(book, toc):
|
|
|
# create main html file from filled template html dom
|
|
|
book_html_f = os.path.join(_BUILD_d,'stories.html')
|
|
|
with open(book_html_f, 'w') as fp:
|
|
|
- fp.write(template_dom.prettify())
|
|
|
+ fp.write(template_dom.prettify(formatter=None))
|
|
|
+
|
|
|
# Typographic characters written into the paragraph text. They are spelled
# as escapes because both are invisible in most editors.
_SOFT_HYPHEN = '\u00ad'  # discretionary line-break point (HTML &shy;)
_NBSP = '\u00a0'  # non-breaking space (HTML &nbsp;)

# Words shorter than this are never hyphenated.
_MIN_WORD_LEN = 5
# Minimum number of characters kept before the first / after the last break.
_MIN_EDGE_LEN = 3

# Parts of serialized markup that must never be edited: comments, tags and
# character references. Used with re.split(), so the captured atoms land on
# the odd indexes of the result and the plain text on the even ones.
_OPAQUE_RE = re.compile(r'(<!--.*?-->|<[^>]*>|&#?\w+;)', re.DOTALL)

# A whitespace-delimited token made of optional punctuation, one run of word
# characters, optional punctuation. Tokens with an inner non-word character
# (l'homme, peut-être, 1950-1960, ...) do not match and stay untouched, so
# their own apostrophes and dashes are never altered.
_WORD_TOKEN_RE = re.compile(r'(?<!\S)([^\w\s]*)(\w+)([^\w\s]*)(?!\S)')

# Punctuation that gets glued to its neighbouring word with a non-breaking
# space, on either side.
_NBSP_PUNCT = r'["»«.!?:;]'
_NBSP_AFTER_RE = re.compile('(' + _NBSP_PUNCT + r')\s(\w)')
_NBSP_BEFORE_RE = re.compile(r'(\w)\s(' + _NBSP_PUNCT + ')')
_NBSP_REPL = r'\1' + _NBSP + r'\2'


def _soft_hyphenate_word(word):
    """Return *word* with soft hyphens at its French hyphenation points.

    *word* must consist of word characters only, so every '-' in the pyphen
    output is a break point and none belongs to the word itself.
    """
    pieces = _H_FR.inserted(word).split('-')
    # Drop break points that would leave fewer than _MIN_EDGE_LEN characters
    # at the start or at the end of the word.
    while len(pieces) > 1 and len(pieces[0]) < _MIN_EDGE_LEN:
        pieces[0:2] = [pieces[0] + pieces[1]]
    while len(pieces) > 1 and len(pieces[-1]) < _MIN_EDGE_LEN:
        pieces[-2:] = [pieces[-2] + pieces[-1]]
    # Joining the pieces converts every remaining break point, including
    # those around a one-letter syllable.
    return _SOFT_HYPHEN.join(pieces)


def _hyphenate_token(match):
    """re.sub callback: hyphenate the word inside one _WORD_TOKEN_RE match."""
    lead, word, trail = match.groups()
    if len(word) < _MIN_WORD_LEN:
        return match.group(0)
    return lead + _soft_hyphenate_word(word) + trail


def _typeset_text(text):
    """Apply soft hyphens, then non-breaking spaces, to a run of plain text."""
    text = _WORD_TOKEN_RE.sub(_hyphenate_token, text)
    text = _NBSP_AFTER_RE.sub(_NBSP_REPL, text)
    text = _NBSP_BEFORE_RE.sub(_NBSP_REPL, text)
    return text


def hyphenate(node):
    """Hyphenate a paragraph node and add French non-breaking spaces.

    The node is serialized, its text is rewritten, and the node is then
    replaced in its tree by the resulting markup as a single plain string.

    Only the text between tags is rewritten. Tags, attribute values,
    comments and character references are copied through unchanged.
    Within the text:

    * every standalone word of at least _MIN_WORD_LEN characters gets soft
      hyphens at the break points reported by the module-level French
      hyphenator, keeping at least _MIN_EDGE_LEN characters on each edge;
    * a single whitespace character between a word character and one of
      the characters in _NBSP_PUNCT (on either side) becomes a
      non-breaking space.

    :param node: BeautifulSoup element to process (the caller passes <p>
        tags). It is detached from its tree by this call.
    :return: None

    NOTE: because the node is replaced by already-serialized markup, the
    document has to be written without re-escaping strings (generate_html
    uses ``prettify(formatter=None)``). Child tags of the paragraph are
    also no longer reachable through the tree afterwards.
    """
    parts = _OPAQUE_RE.split(str(node))
    # Even indexes are plain text, odd indexes are the opaque atoms.
    parts[0::2] = [_typeset_text(text) for text in parts[0::2]]
    node.replace_with(''.join(parts))
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|