#!/usr/bin/env python
- # This script creates a sorted frequency dictionary with stopwords
- # Copyright (C) 2016 Constant, Algolit, An Mertens
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details: <http://www.gnu.org/licenses/>.
- from collections import Counter
- import string
- import nltk
- '''
- This script creates a frequency dictionary of words used in a text filtering out stopwords
- '''
# VARIABLES
# textfiles
# Input corpora (plain-text fragments) and one output file per corpus.
# NOTE(review): handles are opened at import time, relative to the current
# working directory, and without an explicit encoding (platform default is
# used) — confirm the data files match; they are closed at the end of the script.
source1 = open('../data/1984_fragment.txt', 'rt')
source2 = open('../data/verne_fragment.txt', 'rt')
destination1 = open('../data/grand_cru_counting_1984.txt', 'wt')
destination2 = open('../data/grand_cru_counting_verne.txt', 'wt')
# Word-category lists.  "freqwords", "relationals" and "subphrases" document
# the vocabulary groups; only "stopwords" is actually consulted below.
freqwords = [
    "the", "a", "to", "of", "in", "is", "with", "on", "for", "at", "from",
    "about", "are", "an", "up", "out", "have", "be", "this", "one", "says",
    "as", "all", "just", "was", "so", "there", "not", "by", "into", "been",
    "dont", "has", "over", "doesnt", "did", "had", "would", "could", "didnt",
]
relationals = [
    "she", "you", "i", "he", "we", "her", "his", "it", "its", "their", "me",
    "our", "they", "us", "my", "your", "theyre", "them", "youre", "him",
    "were", "these",
]
subphrases = [
    "and", "that", "but", "like", "what", "if", "then", "theres", "or",
    "which", "who", "while", "where", "when", "thats", "how", "because",
]
# Every word dropped by remove_stopwords().  The personal pronouns
# "she", "you", "i", "he" and "we" are deliberately NOT filtered.
stopwords = [
    "the", "a", "to", "of", "in", "is", "with", "on", "for", "at", "from",
    "about", "are", "an", "up", "out", "have", "be", "this", "one", "says",
    "as", "all", "just", "was", "so",
    "her", "his", "it", "its", "their", "me", "our",
    "and", "that", "but", "like", "what", "if", "then", "there", "they",
    "us", "my", "your", "theres", "theyre", "or", "not",
    "which", "by", "who", "them", "into", "while", "been", "dont", "where",
    "youre", "has", "when", "over", "him", "were", "doesnt",
    "did", "thats", "how", "had", "these", "would", "could", "because",
    "didnt",
]
## FUNCTIONS
# PREPROCESSING TEXT FILE
## remove caps + breaks + punctuation
def remove_punct(f):
    """Lowercase the text from *f*, removing line breaks and punctuation.

    f: an iterable of text lines (e.g. an open file object).
    Returns a single lowercased string, newlines replaced by spaces, every
    character in string.punctuation removed, leading/trailing whitespace
    stripped.
    """
    text = ' '.join(line.replace('\n', '') for line in f).lower()
    # One C-level translate pass instead of ~32 chained str.replace() calls.
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.strip()
# remove stopwords
def remove_stopwords(tokens, stoplist=None):
    """Split *tokens* on single spaces and drop stopwords.

    tokens: one space-separated string of words.
    stoplist: optional iterable of words to filter out; defaults to the
        module-level ``stopwords`` list, so existing one-argument calls
        behave exactly as before.
    Returns the surviving words as a list, original order preserved.
    """
    if stoplist is None:
        stoplist = stopwords
    blocked = set(stoplist)  # O(1) membership instead of an O(n) list scan per token
    return [token for token in tokens.split(" ") if token not in blocked]
## create frequency dictionary
def freq_dict(words):
    """Count how often each word occurs.

    words: iterable of word strings.
    Returns a plain dict mapping word -> occurrence count (same result as
    the original hand-rolled try/except-KeyError loop).
    """
    # Counter performs the counting at C speed; convert back to a plain
    # dict so the return type is unchanged for callers.
    return dict(Counter(words))
## sort words by frequency
def sort_dict(frequency_d):
    """Return (word, count) pairs ordered from most to least frequent."""
    # Equivalent to Counter(frequency_d).most_common(): a stable sort on
    # count, descending, so equal counts keep their insertion order.
    return sorted(frequency_d.items(), key=lambda pair: pair[1], reverse=True)
# write words in text file
def write_to_file(frequency, destination):
    """Write one "word : count" line per pair to *destination*, then close it.

    frequency: iterable of (word, count) pairs.
    destination: an open, writable text file; it is closed on return.
    """
    lines = ("{} : {} \n".format(word, count) for word, count in frequency)
    destination.writelines(lines)
    destination.close()
# Write new text into logbook
def writetologbook(content, filename="logbook.txt"):
    """Append *content* to the logbook file, best-effort.

    BUG FIX: the original body read a module-global ``filename`` that is
    never defined in this file, so any call raised NameError — which the
    ``except IOError`` handler did NOT catch.  The target path is now a
    parameter with a default, keeping existing one-argument calls working.

    I/O errors are deliberately swallowed: logging is best-effort and must
    never crash the main script.
    """
    try:
        # ``with`` gives the same close-on-any-exit guarantee as the
        # original try/finally pair.
        with open(filename, "a") as log:
            log.write(content)
    except IOError:
        pass
## SCRIPT
# execute functions
# The same pipeline runs once per corpus:
# tokenise -> drop stopwords -> count -> rank -> write out.
for src, dst in ((source1, destination1), (source2, destination2)):
    cleaned = remove_punct(src)
    kept = remove_stopwords(cleaned)
    ranked = sort_dict(freq_dict(kept))
    write_to_file(ranked, dst)  # also closes dst
    src.close()
    dst.close()  # harmless no-op: write_to_file already closed it (the original double-closed too)
# -------------------------------------------