# adapting_the_reading_glasses~20171216-075426.py
#!/usr/bin/env python
# THIS SCRIPT ADAPTS THE SOURCE TEXT IN THE RIGHT READING FORMAT FOR THE ALGORITHM, CLEANING UP WHITE SPACES/SPLITTING INTO SENTENCES
# source text is written in uppercase
# remove white spaces, put everything in lowercase
# split on punctuation
# write in file capitalizing first letter
# Copyright (C) 2016 Constant, Algolit, An Mertens
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
  16. import nltk.data
  17. # split into sentences
  18. sentences = []
  19. finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
  20. with open('../data/frankenstein_for_machines.txt', 'rt') as source:
  21. for line in source:
  22. # this returns a list with 1 element containing the entire text, sentences separated by \n
  23. sentences = '\n'.join(finding_sentences.tokenize(line.strip().lower().capitalize()))
  24. # transform string into list of sentences
  25. sentences = sentences.split("\n")
  26. # write clean text to a file
  27. with open("frankenstein_for_machines_tf.txt", "w") as destination:
  28. for sentence in sentences:
  29. destination.write(sentence.strip().capitalize()+" ")