#!/usr/bin/python
# -*- coding: utf-8 -*-
# File: build.py
# @Author: Bachir Soussi Chiadmi <bach>
# @Date: 27-03-2017
# @Email: bachir@figureslibres.io
# @Last modified by: bach
# @Last modified time: 21-04-2017
# @License: GPL-V3
  9. import sys, os, shutil
  10. import markdown
  11. # import mistune
  12. from bs4 import BeautifulSoup
  13. import pypandoc
  14. import json
  15. import re
  16. import pyphen
  17. _BOOK_SRC = 'book-src'
  18. _BUILD_d = "build"
  19. # CUR_PATH = os.path.dirname(os.path.abspath(__file__))
  20. # hyphenator
  21. _H_FR = pyphen.Pyphen(lang='fr')
  22. _ERROR_PREF = '\033[1m[!!]\033[0m '
  23. print("Building book")
  24. def main():
  25. # clean build directory
  26. if not os.path.isdir(_BUILD_d):
  27. # shutil.rmtree(_BUILD_d, ignore_errors=True)
  28. os.mkdir(_BUILD_d)
  29. if os.path.isdir(os.path.join(_BUILD_d, 'assets')):
  30. shutil.rmtree(os.path.join(_BUILD_d, 'assets'), ignore_errors=True)
  31. if os.path.isfile(os.path.join(_BUILD_d, 'stories.html')):
  32. shutil.rmtree(os.path.join(_BUILD_d, 'stories.html'), ignore_errors=True)
  33. print('Hyphen has fr language')
  34. print('fr' in pyphen.LANGUAGES)
  35. parse_book(_BOOK_SRC)
  36. def parse_book(book):
  37. # book_name = book.replace('.git', '')
  38. # print("- - -")
  39. print("Parse book")
  40. # print("- - -")
  41. # table of content (ordered list of markdown files)
  42. sum_p = os.path.join(_BOOK_SRC, "SUMMARY.md")
  43. if not os.path.isfile(sum_p):
  44. print("No summary file, can't generate html")
  45. return
  46. sum_f = open(sum_p)
  47. sum_str = sum_f.read()
  48. # print(sum_str)
  49. # convert md to html
  50. sum_html = markdown.markdown(sum_str)
  51. # print(sum_html)
  52. # create dom from html string (as it will be parsable)
  53. sum_dom = BeautifulSoup(sum_html, 'html.parser')
  54. # print(sum_dom)
  55. # parse html dom to get file list in the right order
  56. toc = parse_summary(sum_dom.ul, {})
  57. # print(toc)
  58. # generate final html build for html2print
  59. generate_html(book, toc)
  60. def parse_summary(ul, toc):
  61. print("Parse summary")
  62. i=0
  63. for li in ul.find_all('li',recursive=False):
  64. # print('li')
  65. for a in li.find_all('a',recursive=False):
  66. # print(a.get_text(strip=True))
  67. # print(a['href'])
  68. href = a['href']
  69. href = re.sub(r'^/', '', href)
  70. toc[i] = {
  71. 'label':a.get_text(strip=True),
  72. 'file':href
  73. }
  74. i = i+1
  75. return toc
  76. def generate_html(book, toc):
  77. print("Generate html")
  78. #
  79. # create main html dom from template
  80. template_f = open("templates/main.tpl.html", "r")
  81. template_html = template_f.read()
  82. template_dom = BeautifulSoup(template_html, 'html.parser')
  83. # replace title
  84. # template_dom.html.head.title.contents[0].replaceWith(book_name)
  85. # get story div
  86. story_dom = template_dom.find('div', {"id":"flow-main"})
  87. # loop through pages to convert them to html and add it to main html file
  88. pi = 0
  89. for page in toc:
  90. # print(toc[p])
  91. pagename = toc[page]['label']
  92. pageid = re.sub('[^a-z0-9]+', '-', pagename.lower())
  93. print("\033[92m"+pageid+"\033[0m")
  94. # files
  95. in_f = os.path.join(_BOOK_SRC, toc[page]['file'])
  96. if not os.path.isfile(in_f):
  97. print(_ERROR_PREF+"Source path is not a file, can't generate html : "+in_f)
  98. continue
  99. # print('in_f : '+in_f)
  100. # out_f = os.path.join(book_build_d_pages, toc[p]['file'].replace('/', '-').replace('.md', '.html'))
  101. # print('out_f : '+out_f)
  102. pdoc_args = ['--mathjax',
  103. '--smart']
  104. pdoc_filters = []
  105. output = pypandoc.convert_file(in_f,
  106. to='html5',
  107. format='markdown+header_attributes+link_attributes+bracketed_spans',
  108. extra_args=pdoc_args,
  109. filters=pdoc_filters)
  110. # outputfile=out_f)
  111. # print("output :\n"+output)
  112. output_dom = BeautifulSoup(output, 'html.parser')
  113. # print("output_dom :")
  114. # print(output_dom)
  115. # hyphenate paragraphes
  116. for node in output_dom.find_all('p'):
  117. hyphenate(node)
  118. # print(str(output_dom))
  119. # copy images
  120. for img in output_dom.find_all('img'):
  121. # print('-- img ',img)
  122. att_src = re.sub(r"^\/", "", img['src'])
  123. img['src'] = att_src
  124. # domimg = output_dom.find('img', {'src':img['src']})
  125. # domimg['src'] = att_src
  126. # print(domimg)
  127. src_img = os.path.join(_BOOK_SRC, att_src)
  128. # print('- - '+src_img)
  129. if not os.path.isfile(src_img):
  130. print(_ERROR_PREF+"Source path is not a file, can't copy img : \033[1m"+src_img+"\033[0m")
  131. continue
  132. dest_img = os.path.join(_BUILD_d, att_src)
  133. # print('- - '+dest_img)
  134. dest_path, dest_file = os.path.split(dest_img)
  135. if not os.path.isdir(dest_path):
  136. os.makedirs(dest_path)
  137. shutil.copyfile(src_img, dest_img)
  138. # append html story page to template_dom
  139. story_page = BeautifulSoup('<div class="story-page story-page-'+str(pi)+'" id="'+pageid+'"></div>', 'html.parser')
  140. story_page.div.append(output_dom)
  141. story_dom.append(story_page)
  142. pi = pi+1
  143. # create main html file from filled template html dom
  144. book_html_f = os.path.join(_BUILD_d,'stories.html')
  145. with open(book_html_f, 'w') as fp:
  146. fp.write(template_dom.prettify(formatter=None))
  147. def hyphenate(node):
  148. # print("hyphenate")
  149. nodetext = node.get_text()
  150. # print(nodetext)
  151. nodestr = str(node)
  152. # print(nodestr)
  153. for word in nodetext.split(' '):
  154. # do not hyphenate if it's not a real word
  155. if len(word) < 5 or re.search('\w+', word) == None:
  156. continue
  157. # cleaning word
  158. # remove all non-alphanumerical characteres duplicated or more
  159. word = re.sub('\W{2,}', '', word)
  160. # remove all non-alphanumerical at the begining of word
  161. word = re.sub('^\W', '', word)
  162. # remove all non-alphanumerical at the end of word
  163. word = re.sub('\W$', '', word)
  164. # remove all word remaing having special chars
  165. if re.search('\W+', word):
  166. continue
  167. # hyphenate word
  168. word_hyphenated = _H_FR.inserted(word)
  169. # remove hyphen precedeted by less than 3 letters
  170. word_hyphenated = re.sub(r'^(\w{,2})-', r'\1', word_hyphenated)
  171. # remove hyphen followed by less than 3 letters
  172. word_hyphenated = re.sub(r'-(\w{,2})$', r'\1', word_hyphenated)
  173. # replace scores by html elemt &shy;
  174. word_hyphenated = re.sub(r'(\w)-(\w)', r'\1&shy;\2', word_hyphenated)
  175. # replace double scores by score+$shy;
  176. word_hyphenated = re.sub(r'--', r'-&shy;', word_hyphenated)
  177. # TODO: attention au date 1950-1960, le tiret disparait
  178. # print(word_hyphenated)
  179. if re.search('\b+', word):
  180. print(word+" | "+word_hyphenated)
  181. try:
  182. # replace word by hyhanated_word on source
  183. nodestr = re.sub(word, word_hyphenated, nodestr)
  184. # replaced_str_dom = BeautifulSoup(replaced_str, 'html.parser')
  185. # node.string = replaced_str
  186. # node.string.replace_with(node.string)
  187. except Exception as e:
  188. print(_ERROR_PREF+'Replacement error with \033[1m'+word+'\033[0m | \033[1m'+word_hyphenated+"\033[0m")
  189. print(e)
  190. print(node.string)
  191. print('[//]')
  192. pass
  193. # add none breaking spaces
  194. nbspzr_before = ['»', '\!', '\?', ':', ';']
  195. for char in nbspzr_before:
  196. nodestr = re.sub(r'(\w|>)\s('+char+')', r'\1&nbsp;\2', nodestr)
  197. nbspzr_after = ['«']
  198. for char in nbspzr_after:
  199. nodestr = re.sub(r'('+char+')\s(\w|<)', r'\1&nbsp;\2', nodestr)
  200. # print(nodestr)
  201. # replace node by hyphenated one
  202. node.replace_with(nodestr)
  203. if __name__ == "__main__":
  204. main()