#!/usr/bin/python
# -*- coding: utf-8 -*-

# @Author: Bachir Soussi Chiadmi
# @Date:   27-03-2017
# @Email:  bachir@figureslibres.io
# @Last modified by:   bach
# @Last modified time: 21-04-2017
# @License: GPL-V3

"""Build a single HTML file (for html2print) from a markdown book.

The book lives in ``book-src``: ``SUMMARY.md`` lists the markdown pages in
reading order.  Each page is converted with pandoc, its paragraphs are
hyphenated (French) and the result is appended to the ``#flow-main`` element
of ``templates/main.tpl.html``.  The output is written to
``build/stories.html``; referenced images are copied next to it.
"""

import sys
import os
import shutil
import json
import re

import markdown
# import mistune
from bs4 import BeautifulSoup
import pypandoc
import pyphen

_BOOK_SRC = 'book-src'
_BUILD_d = "build"
_TEMPLATE_f = "templates/main.tpl.html"
# CUR_PATH = os.path.dirname(os.path.abspath(__file__))

# hyphenator
_H_FR = pyphen.Pyphen(lang='fr')

_ERROR_PREF = '\033[1m[!!]\033[0m '

# Typographic helpers are inserted as HTML entities rather than as the
# (invisible) literal characters: the final document is serialized with
# formatter=None, so the entities reach the output file verbatim.
_SHY = '&shy;'
_NBSP = '&nbsp;'

# Splits serialized markup into [text, tag, text, tag, ..., text].
# NOTE(review): assumes '>' never appears unescaped inside an attribute
# value of the serialized paragraph — confirm for unusual markup.
_TAG_RE = re.compile(r'(<[^>]*>)')

print("Building book")


def main():
    """Prepare the build directory, then build the book from ``_BOOK_SRC``."""
    # The build directory is created if missing but never wiped as a whole;
    # only the generated artefacts below are removed.
    if not os.path.isdir(_BUILD_d):
        # shutil.rmtree(_BUILD_d, ignore_errors=True)
        os.mkdir(_BUILD_d)

    assets_d = os.path.join(_BUILD_d, 'assets')
    if os.path.isdir(assets_d):
        shutil.rmtree(assets_d, ignore_errors=True)

    # stories.html is a regular file: rmtree() cannot delete it (and the
    # failure was hidden by ignore_errors), so remove it explicitly.
    stories_f = os.path.join(_BUILD_d, 'stories.html')
    if os.path.isfile(stories_f):
        os.remove(stories_f)

    print('Hyphen has fr language')
    print('fr' in pyphen.LANGUAGES)

    parse_book(_BOOK_SRC)


def parse_book(book):
    """Read the book summary and generate the HTML build from it.

    ``book`` is only passed through to ``generate_html``; the summary is
    always read from ``_BOOK_SRC/SUMMARY.md``.  Prints a message and returns
    without building when the summary is missing or contains no list.
    """
    print("Parse book")

    # table of content (ordered list of markdown files)
    sum_p = os.path.join(_BOOK_SRC, "SUMMARY.md")
    if not os.path.isfile(sum_p):
        print("No summary file, can't generate html")
        return

    with open(sum_p, encoding='utf-8') as sum_f:
        sum_str = sum_f.read()

    # convert md to html, then to a dom so the list becomes parsable
    sum_html = markdown.markdown(sum_str)
    sum_dom = BeautifulSoup(sum_html, 'html.parser')

    if sum_dom.ul is None:
        print(_ERROR_PREF + "No list found in summary file, can't generate html")
        return

    # parse html dom to get file list in the right order
    toc = parse_summary(sum_dom.ul, {})

    # generate final html build for html2print
    generate_html(book, toc)


def parse_summary(ul, toc):
    """Fill ``toc`` from the direct ``<li>``/``<a>`` children of ``ul``.

    Entries are stored under consecutive integer keys starting at 0, each as
    ``{'label': link text, 'file': href without its leading slash}``.
    Links without an ``href`` are skipped.  Returns ``toc``.
    """
    print("Parse summary")
    i = 0
    for li in ul.find_all('li', recursive=False):
        for a in li.find_all('a', recursive=False):
            href = a.get('href')
            if not href:
                continue
            toc[i] = {
                'label': a.get_text(strip=True),
                'file': re.sub(r'^/', '', href),
            }
            i += 1
    return toc


def _copy_images(dom):
    """Make image sources relative and copy the image files to the build dir."""
    for img in dom.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        att_src = re.sub(r"^\/", "", src)
        img['src'] = att_src

        src_img = os.path.join(_BOOK_SRC, att_src)
        if not os.path.isfile(src_img):
            print(_ERROR_PREF + "Source path is not a file, can't copy img : \033[1m"
                  + src_img + "\033[0m")
            continue

        dest_img = os.path.join(_BUILD_d, att_src)
        os.makedirs(os.path.dirname(dest_img), exist_ok=True)
        shutil.copyfile(src_img, dest_img)


def generate_html(book, toc):
    """Convert every page of ``toc`` to HTML and write ``build/stories.html``.

    ``toc`` maps keys to ``{'label': ..., 'file': ...}`` entries; files are
    resolved relative to ``_BOOK_SRC``.  Pages whose source file is missing
    are reported and skipped.  ``book`` is currently unused.
    """
    print("Generate html")

    # create main html dom from template
    if not os.path.isfile(_TEMPLATE_f):
        print(_ERROR_PREF + "Template is not a file, can't generate html : " + _TEMPLATE_f)
        return
    with open(_TEMPLATE_f, "r", encoding='utf-8') as template_f:
        template_dom = BeautifulSoup(template_f.read(), 'html.parser')

    # get story div
    story_dom = template_dom.find('div', {"id": "flow-main"})
    if story_dom is None:
        print(_ERROR_PREF + "No div#flow-main in template, can't generate html")
        return

    # NOTE(review): '--smart' was removed in pandoc 2.0 in favour of the
    # '+smart' extension — confirm the pandoc version this runs against.
    pdoc_args = ['--mathjax', '--smart']
    pdoc_filters = []

    # loop through pages to convert them to html and add it to main html file
    pi = 0
    for page in toc:
        pagename = toc[page]['label']
        pageid = re.sub('[^a-z0-9]+', '-', pagename.lower())
        print("\033[92m" + pageid + "\033[0m")

        in_f = os.path.join(_BOOK_SRC, toc[page]['file'])
        if not os.path.isfile(in_f):
            print(_ERROR_PREF + "Source path is not a file, can't generate html : " + in_f)
            continue

        output = pypandoc.convert_file(
            in_f,
            to='html5',
            format='markdown+header_attributes+link_attributes+bracketed_spans',
            extra_args=pdoc_args,
            filters=pdoc_filters)
        output_dom = BeautifulSoup(output, 'html.parser')

        # Images must be handled BEFORE hyphenation: hyphenate() replaces
        # each <p> by a flat string, after which images inside paragraphs
        # can no longer be found in the dom.
        _copy_images(output_dom)

        # hyphenate paragraphs
        for node in output_dom.find_all('p'):
            hyphenate(node)

        # append html story page to template_dom
        # NOTE(review): the wrapper markup was missing from the original
        # string literal; it is reconstructed as a div carrying the page id
        # and index (both were computed but otherwise unused) — confirm the
        # class names against the stylesheet.
        story_page = BeautifulSoup(
            '<div class="story-page story-page-' + str(pi) + '" id="' + pageid + '"></div>',
            'html.parser')
        story_page.div.append(output_dom)
        story_dom.append(story_page)

        pi = pi + 1

    # create main html file from filled template html dom
    book_html_f = os.path.join(_BUILD_d, 'stories.html')
    with open(book_html_f, 'w', encoding='utf-8') as fp:
        # formatter=None: strings are written unescaped, which is what lets
        # the hyphenated paragraphs (stored as raw markup) come out as HTML.
        fp.write(template_dom.prettify(formatter=None))


def _clean_word(word):
    """Return ``word`` stripped of surrounding punctuation, or None to skip it.

    A word is skipped when it is shorter than 5 characters, contains no word
    character, or still contains a non-word character after cleaning
    (apostrophes, inner hyphens such as in ``1950-1960``, ...).
    """
    if len(word) < 5 or re.search(r'\w+', word) is None:
        return None
    # remove every run of two or more non-alphanumerical characters
    word = re.sub(r'\W{2,}', '', word)
    # remove one non-alphanumerical character at each end of the word
    word = re.sub(r'^\W', '', word)
    word = re.sub(r'\W$', '', word)
    if not word or re.search(r'\W', word):
        return None
    return word


def _hyphenate_word(word):
    """Return ``word`` with ``&shy;`` at its allowed French break points."""
    hyphenated = _H_FR.inserted(word)
    # remove hyphen preceded by less than 3 letters
    hyphenated = re.sub(r'^(\w{,2})-', r'\1', hyphenated)
    # remove hyphen followed by less than 3 letters
    hyphenated = re.sub(r'-(\w{,2})$', r'\1', hyphenated)
    # Lookarounds instead of capturing the neighbours: consuming them left a
    # literal '-' after single-letter syllables (x-y-z only matched once).
    return re.sub(r'(?<=\w)-(?=\w)', _SHY, hyphenated)


def hyphenate(node):
    """Replace ``node`` by its markup with soft hyphens and non-breaking spaces.

    ``node`` is a BeautifulSoup tag.  It is replaced in its tree by a plain
    string holding the rewritten markup, so it only renders as HTML when the
    document is serialized without escaping.
    """
    # Collect each distinct hyphenable word once.  split() (any whitespace)
    # rather than split(' '): words separated by a line break were glued
    # together and then skipped as containing a non-word character.
    replacements = {}
    for raw_word in node.get_text().split():
        word = _clean_word(raw_word)
        if word is None or word in replacements:
            continue
        hyphenated = _hyphenate_word(word)
        if hyphenated != word:
            replacements[word] = hyphenated

    nodestr = str(node)

    if replacements:
        # One pass over the text segments only, on whole words: tag names and
        # attribute values (href, class, src, ...) are left untouched and
        # already inserted entities are never rescanned.
        alternatives = sorted(replacements, key=len, reverse=True)
        words_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, alternatives)) + r')\b')
        parts = _TAG_RE.split(nodestr)
        parts[::2] = [
            words_re.sub(lambda m: replacements[m.group(0)], text)
            for text in parts[::2]
        ]
        nodestr = ''.join(parts)

    # add non-breaking spaces before closing guillemets and double punctuation
    nodestr = re.sub(r'(\w|>)\s([»!?:;])', r'\1' + _NBSP + r'\2', nodestr)
    # ... and after opening guillemets
    nodestr = re.sub(r'(«)\s(\w|<)', r'\1' + _NBSP + r'\2', nodestr)

    # replace node by hyphenated one
    node.replace_with(nodestr)


if __name__ == "__main__":
    main()