Browse Source

added hyphenation to md2html feature

Bachir Soussi Chiadmi 6 years ago
parent
commit
08361ddc99
1 changed files with 82 additions and 1 deletions
  1. 82 1
      libriis/classes/md2html.py

+ 82 - 1
libriis/classes/md2html.py

@@ -17,6 +17,7 @@ from PyQt5.QtCore import QFileSystemWatcher
 import json
 from bs4 import BeautifulSoup
 import pypandoc
+import pyphen
 
 
 class Compiler():
@@ -57,6 +58,10 @@ class Compiler():
    def compileContents(self):
       print('Compiling md')
 
+      # hyphenator
+      self._H_FR = pyphen.Pyphen(lang='fr')
+      # TODO: add to settings language choice
+
       # create main html dom from template
       template_f = open(os.path.join(self.core.appcwd,"templates/main.tpl.html"), "r")
       template_html = template_f.read()
@@ -92,7 +97,9 @@ class Compiler():
 
          output_dom = BeautifulSoup(output, 'html.parser')
 
-         # TODO: hyphenate paragraph
+         # hyphenate paragraphes
+         for node in output_dom.find_all('p'):
+            self.hyphenate(node)
 
          # append html story page to template_dom
          story_page = BeautifulSoup(
@@ -109,3 +116,77 @@ class Compiler():
       book_html_f = os.path.join(self.core.cwd,'index.html')
       with open(book_html_f, 'w') as fp:
          fp.write(template_dom.prettify())
+
+
+   #     __  __            __
+   #    / / / /_  ______  / /_  ___  ____  _____
+   #   / /_/ / / / / __ \/ __ \/ _ \/ __ \/ ___/
+   #  / __  / /_/ / /_/ / / / /  __/ / / (__  )
+   # /_/ /_/\__, / .___/_/ /_/\___/_/ /_/____/
+   #       /____/_/
+   def hyphenate(self, node):
+      # print("hyphenate")
+      nodetext = node.get_text()
+      # print(nodetext)
+      nodestr = str(node)
+      # print(nodestr)
+      for word in nodetext.split(' '):
+
+         # do not hyphenate if it's not a real word
+         if len(word) < 5 or re.search('\w+', word) == None:
+            continue
+
+         # cleaning word
+         # remove all non-alphanumerical characteres duplicated or more
+         word = re.sub('\W{2,}', '', word)
+         # remove all non-alphanumerical at the begining of word
+         word = re.sub('^\W', '', word)
+         # remove all non-alphanumerical at the end of word
+         word = re.sub('\W$', '', word)
+
+         # remove all word remaing having special chars
+         if re.search('\W+', word):
+            continue
+
+         # hyphenate word
+         word_hyphenated = self._H_FR.inserted(word)
+         # remove hyphen precedeted by less than 3 letters
+         word_hyphenated = re.sub(r'^(\w{,2})-', r'\1', word_hyphenated)
+         # remove hyphen followed by less than 3 letters
+         word_hyphenated = re.sub(r'-(\w{,2})$', r'\1', word_hyphenated)
+         # replace scores by html elemt &shy;
+         word_hyphenated = re.sub(r'(\w)-(\w)', r'\1&shy;\2', word_hyphenated)
+         # replace double scores by score+$shy;
+         word_hyphenated = re.sub(r'--', r'-&shy;', word_hyphenated)
+         # TODO: attention au date 1950-1960, le tiret disparait
+
+         # print(word_hyphenated)
+
+         if re.search('\b+', word):
+            print(word+" | "+word_hyphenated)
+
+         try:
+            # replace word by hyhanated_word on source
+            nodestr = re.sub(word, word_hyphenated, nodestr)
+            # replaced_str_dom = BeautifulSoup(replaced_str, 'html.parser')
+            # node.string = replaced_str
+            # node.string.replace_with(node.string)
+         except Exception as e:
+            print(_ERROR_PREF+'Replacement error with \033[1m'+word+'\033[0m | \033[1m'+word_hyphenated+"\033[0m")
+            print(e)
+            print(node.string)
+            print('[//]')
+            pass
+
+      # add none breaking spaces
+      nbspzr_before = ['»', '\!', '\?', ':', ';']
+      for char in nbspzr_before:
+         nodestr = re.sub(r'(\w|>)\s('+char+')', r'\1&nbsp;\2', nodestr)
+
+      nbspzr_after = ['«']
+      for char in nbspzr_after:
+         nodestr = re.sub(r'('+char+')\s(\w|<)', r'\1&nbsp;\2', nodestr)
+
+      # print(nodestr)
+      # replace node by hyphenated one
+      node.replace_with(nodestr)