| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 | 
							- from collections import Counter
 
- import nltk
 
- import re
 
- import pickle
 
# VARIABLES

# Input text; the script iterates over it line by line further down.
source = open("../data/1984_fragment.txt", "r")

# Report file; it is closed by the `with destination` block at the end of
# the script.
destination = open("../data/1984_fragment_trigrams.txt", "w")

# The backslash is escaped explicitly: a bare "\S" in a normal string literal
# is an invalid escape sequence that newer Python versions warn about. The
# text written to the file is unchanged (backslash followed by "S").
# NOTE(review): the header mentions Obama and the Penn Treebank while the
# input path is a 1984 fragment -- confirm the intended wording.
destination.write("OBAMA\\S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")
 
- # FUNCTIONS
 
- ## sort words by frequency (import module)
 
def sort_dict(frequency_d):
    """Return the (key, count) pairs of *frequency_d*, highest count first.

    The mapping is wrapped in a ``Counter`` and ``most_common()`` is called
    without a limit, so every entry is returned as a list of tuples.
    """
    return Counter(frequency_d).most_common()
 
- ## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS		
 
# 1. Create dictionary of trigrams

# A token is kept only when its first character is an ASCII letter or digit
# (`match` anchors at the start of the string). Compiled once here instead
# of once per word inside the loops.
cleaning = re.compile(r"[A-Za-z0-9]")

# Maps each trigram (a 3-tuple of tokens) to the number of times it occurs.
trigrams = Counter()

# Close the input as soon as it has been consumed.
with source:
    for line in source:
        # split() without an argument splits on any whitespace, so the
        # trailing newline is no longer glued to the last token of the line
        # (which made "word\n" and "word" count as different tokens).
        clean_tri = [word for word in line.split() if cleaning.match(word)]

        # Trigrams are built per line, so they never span a line break.
        trigrams.update(nltk.trigrams(clean_tri))

# 2. Rank the trigrams and keep the ten most frequent ones
trigrams_sorted = sort_dict(trigrams)
first10pairs = trigrams_sorted[:10]

# 3. Write one "trigram : count" line per entry; leaving the `with` block
# closes the report file.
with destination as text:
    for tri, frequency in first10pairs:
        text.write("{} : {} \n".format(tri, frequency))
 
 
  |