123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
- from collections import Counter
- import nltk
- import re
- import pickle
- # VARIABLES
- source = open("../data/1984_fragment.txt", "r")
- destination = open("../data/1984_fragment_trigrams.txt", "w")
- destination.write("OBAMA\S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")
- # FUNCTIONS
- ## sort words by frequency (import module)
- def sort_dict(frequency_d):
- c=Counter(frequency_d)
- frequency = c.most_common()
- return frequency
- ## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS
- # 1. Create dictionary of trigrams
- trigrams = {}
- for line in source:
- # remove punctuation
- clean_tri = []
- words = line.split(" ")
- for word in words:
- cleaning = re.compile(r"[A-Za-z0-9]")
- if cleaning.match(word):
- clean_tri.append(word)
- else:
- pass
- # find trigrams
- tricount = nltk.trigrams(clean_tri)
- # count frequency of each trigram and add trigram + value in dictionary
- for trigram in tricount:
- if trigram in trigrams:
- trigrams[trigram] += 1
- else:
- trigrams[trigram] = 1
- trigrams_sorted = sort_dict(trigrams)
- first10pairs = trigrams_sorted[:10]
- with destination as text:
- for tri, frequency in first10pairs:
- text.write("{} : {} \n".format(tri, frequency))
|