bachir
/
ola5doc


			
							1234567891011121314151617181920212223242526272829303132333435363738
#!/usr/bin/env python
# THIS SCRIPT ADAPTS THE SOURCE TEXT TO THE READING FORMAT EXPECTED BY THE ALGORITHM: CLEANING UP WHITE SPACE AND SPLITTING INTO SENTENCES
# source text is written in uppercase
# remove white spaces, put everything in lowercase
# split on punctuation
# write in file capitalizing first letter

 #    Copyright (C) 2016 Constant, Algolit, An Mertens

 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.

 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details: <http://www.gnu.org/licenses/>.


import nltk.data

# Split the source text into sentences.
# The Punkt sentence tokenizer is loaded once and applied to every line of
# the input file; the sentences found in all lines are collected in order.
finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
with open('../data/frankenstein_for_machines.txt', 'rt') as source:
	for line in source:
		# Normalise the line: trim surrounding whitespace, lowercase it,
		# and capitalize only its first character.
		cleaned = line.strip().lower().capitalize()
		if not cleaned:
			# Blank lines carry no sentences; skipping them avoids empty
			# entries that would end up as stray spaces in the output.
			continue
		# Accumulate instead of rebinding `sentences`: rebinding on every
		# iteration would keep only the sentences of the last line read.
		sentences.extend(finding_sentences.tokenize(cleaned))

# Write the clean text to a file: each sentence is stripped, its first
# letter capitalized, and followed by a single space.
with open("frankenstein_for_machines_tf.txt", "w") as destination:
	for sentence in sentences:
		destination.write(sentence.strip().capitalize() + " ")