#!/usr/bin/env python
- # This script creates a sorted frequency dictionary with stopwords
- # Copyright (C) 2016 Constant, Algolit, An Mertens
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details: <http://www.gnu.org/licenses/>.
- from collections import Counter
- import string
- import nltk
- '''
- This script creates a frequency dictionary of words used in a text filtering out stopwords
- '''
# VARIABLES
# textfiles
# Input corpora (plain-text fragments) and one output file per corpus.
# NOTE(review): handles are opened at import time, relative to the current
# working directory, and without an explicit encoding (platform default is
# used) — confirm the data files match; they are closed at the end of the script.
source1 = open('../data/1984_fragment.txt', 'rt')
source2 = open('../data/verne_fragment.txt', 'rt')
destination1 = open('../data/grand_cru_counting_1984.txt', 'wt')
destination2 = open('../data/grand_cru_counting_verne.txt', 'wt')
# Word-category lists.  "freqwords", "relationals" and "subphrases" document
# the vocabulary groups; only "stopwords" is actually consulted below.
freqwords = [
    "the", "a", "to", "of", "in", "is", "with", "on", "for", "at", "from",
    "about", "are", "an", "up", "out", "have", "be", "this", "one", "says",
    "as", "all", "just", "was", "so", "there", "not", "by", "into", "been",
    "dont", "has", "over", "doesnt", "did", "had", "would", "could", "didnt",
]
relationals = [
    "she", "you", "i", "he", "we", "her", "his", "it", "its", "their", "me",
    "our", "they", "us", "my", "your", "theyre", "them", "youre", "him",
    "were", "these",
]
subphrases = [
    "and", "that", "but", "like", "what", "if", "then", "theres", "or",
    "which", "who", "while", "where", "when", "thats", "how", "because",
]
# Every word dropped by remove_stopwords().  The personal pronouns
# "she", "you", "i", "he" and "we" are deliberately NOT filtered.
stopwords = [
    "the", "a", "to", "of", "in", "is", "with", "on", "for", "at", "from",
    "about", "are", "an", "up", "out", "have", "be", "this", "one", "says",
    "as", "all", "just", "was", "so",
    "her", "his", "it", "its", "their", "me", "our",
    "and", "that", "but", "like", "what", "if", "then", "there", "they",
    "us", "my", "your", "theres", "theyre", "or", "not",
    "which", "by", "who", "them", "into", "while", "been", "dont", "where",
    "youre", "has", "when", "over", "him", "were", "doesnt",
    "did", "thats", "how", "had", "these", "would", "could", "because",
    "didnt",
]
## FUNCTIONS
# PREPROCESSING TEXT FILE
## remove caps + breaks + punctuation
def remove_punct(f):
    """Lowercase the text from *f*, removing line breaks and punctuation.

    f: an iterable of text lines (e.g. an open file object).
    Returns a single lowercased string, newlines replaced by spaces, every
    character in string.punctuation removed, leading/trailing whitespace
    stripped.
    """
    text = ' '.join(line.replace('\n', '') for line in f).lower()
    # One C-level translate pass instead of ~32 chained str.replace() calls.
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.strip()
# remove stopwords
def remove_stopwords(tokens, stoplist=None):
    """Split *tokens* on single spaces and drop stopwords.

    tokens: one space-separated string of words.
    stoplist: optional iterable of words to filter out; defaults to the
        module-level ``stopwords`` list, so existing one-argument calls
        behave exactly as before.
    Returns the surviving words as a list, original order preserved.
    """
    if stoplist is None:
        stoplist = stopwords
    blocked = set(stoplist)  # O(1) membership instead of an O(n) list scan per token
    return [token for token in tokens.split(" ") if token not in blocked]
## create frequency dictionary
def freq_dict(words):
    """Count how often each word occurs.

    words: iterable of word strings.
    Returns a plain dict mapping word -> occurrence count (same result as
    the original hand-rolled try/except-KeyError loop).
    """
    # Counter performs the counting at C speed; convert back to a plain
    # dict so the return type is unchanged for callers.
    return dict(Counter(words))
## sort words by frequency
def sort_dict(frequency_d):
    """Return (word, count) pairs ordered from most to least frequent."""
    # Equivalent to Counter(frequency_d).most_common(): a stable sort on
    # count, descending, so equal counts keep their insertion order.
    return sorted(frequency_d.items(), key=lambda pair: pair[1], reverse=True)
# write words in text file
def write_to_file(frequency, destination):
    """Write one "word : count" line per pair to *destination*, then close it.

    frequency: iterable of (word, count) pairs.
    destination: an open, writable text file; it is closed on return.
    """
    lines = ("{} : {} \n".format(word, count) for word, count in frequency)
    destination.writelines(lines)
    destination.close()
# Write new text into logbook
def writetologbook(content, filename="logbook.txt"):
    """Append *content* to the logbook file, best-effort.

    BUG FIX: the original body read a module-global ``filename`` that is
    never defined in this file, so any call raised NameError — which the
    ``except IOError`` handler did NOT catch.  The target path is now a
    parameter with a default, keeping existing one-argument calls working.

    I/O errors are deliberately swallowed: logging is best-effort and must
    never crash the main script.
    """
    try:
        # ``with`` gives the same close-on-any-exit guarantee as the
        # original try/finally pair.
        with open(filename, "a") as log:
            log.write(content)
    except IOError:
        pass
## SCRIPT
# execute functions
# The same pipeline runs once per corpus:
# tokenise -> drop stopwords -> count -> rank -> write out.
for src, dst in ((source1, destination1), (source2, destination2)):
    cleaned = remove_punct(src)
    kept = remove_stopwords(cleaned)
    ranked = sort_dict(freq_dict(kept))
    write_to_file(ranked, dst)  # also closes dst
    src.close()
    dst.close()  # harmless no-op: write_to_file already closed it (the original double-closed too)
# -------------------------------------------