most_frequent_trigrams~20171216-080036.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. from collections import Counter
  2. import nltk
  3. import re
  4. import pickle
  5. # VARIABLES
  6. source = open("../data/1984_fragment.txt", "r")
  7. destination = open("../data/1984_fragment_trigrams.txt", "w")
  8. destination.write("OBAMA\S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")
  9. # FUNCTIONS
  10. ## sort words by frequency (import module)
  11. def sort_dict(frequency_d):
  12. c=Counter(frequency_d)
  13. frequency = c.most_common()
  14. return frequency
  15. ## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS
  16. # 1. Create dictionary of trigrams
  17. trigrams = {}
  18. for line in source:
  19. # remove punctuation
  20. clean_tri = []
  21. words = line.split(" ")
  22. for word in words:
  23. cleaning = re.compile(r"[A-Za-z0-9]")
  24. if cleaning.match(word):
  25. clean_tri.append(word)
  26. else:
  27. pass
  28. # find trigrams
  29. tricount = nltk.trigrams(clean_tri)
  30. # count frequency of each trigram and add trigram + value in dictionary
  31. for trigram in tricount:
  32. if trigram in trigrams:
  33. trigrams[trigram] += 1
  34. else:
  35. trigrams[trigram] = 1
  36. trigrams_sorted = sort_dict(trigrams)
  37. first10pairs = trigrams_sorted[:10]
  38. with destination as text:
  39. for tri, frequency in first10pairs:
  40. text.write("{} : {} \n".format(tri, frequency))