# counting_a_frequency_of_words_grand_cru~20171216-075932.py (4.0 KB)
#!/usr/bin/env python
# This script creates a sorted frequency dictionary, filtering out stopwords
  3. # Copyright (C) 2016 Constant, Algolit, An Mertens
  4. # This program is free software: you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation, either version 3 of the License, or
  7. # (at your option) any later version.
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details: <http://www.gnu.org/licenses/>.
  12. from collections import Counter
  13. import string
  14. import nltk
  15. '''
  16. This script creates a frequency dictionary of words used in a text filtering out stopwords
  17. '''
  18. # VARIABLES
  19. # textfiles
  20. source1 = open('../data/1984_fragment.txt', 'rt')
  21. source2 = open('../data/verne_fragment.txt', 'rt')
  22. destination1 = open('../data/grand_cru_counting_1984.txt', 'wt')
  23. destination2 = open('../data/grand_cru_counting_verne.txt', 'wt')
  24. freqwords = ["the", "a", "to", "of", "in", 'is', "with", "on", "for", "at", "from", "about",\
  25. "are", "an", "up", "out", "have", "be", "this", "one", "says", "as", "all", "just", "was", "so", "there", "not", "by",\
  26. "into", "been", "dont", "has", "over", "doesnt", "did", "had", "would", "could", "didnt"]
  27. relationals = ["she", "you", "i", "he", "we", "her", "his", "it", "its", "their", "me", "our", 'they', "us", "my",\
  28. "your", "theyre", 'them', "youre", "him", "were", "these"]
  29. subphrases = ["and", "that", "but", "like", "what", "if", "then","theres", "or", "which", "who", "while", "where", "when",\
  30. "thats", "how", "because"]
  31. # removed "she", "you", "i", "he", "we",
  32. stopwords = ["the", "a", "to", "of", "in", 'is', "with", "on", "for", "at", "from", "about",\
  33. "are", "an", "up", "out", "have", "be", "this", "one", "says", "as", "all", "just", "was", "so",\
  34. "her", "his", "it", "its", "their", "me", "our",\
  35. "and", "that", "but", "like", "what", "if", "then", "there", "they", "us", "my", "your", "theres", "theyre", "or", "not",\
  36. "which", "by", "who", "them", "into", "while", "been", "dont", "where", "youre", "has", "when", "over", "him", "were", "doesnt",\
  37. "did", "thats", "how", "had", "these", "would", "could", "because", "didnt"]
  38. ## FUNCTIONS
  39. # PREPROCESSING TEXT FILE
  40. ## remove caps + breaks + punctuation
  41. def remove_punct(f):
  42. tokens = (' '.join(line.replace('\n', '') for line in f)).lower()
  43. for c in string.punctuation:
  44. tokens= tokens.replace(c,"")
  45. tokens = tokens.strip()
  46. #print("tokens", type(tokens))
  47. return tokens
  48. # remove stopwords
  49. def remove_stopwords(tokens):
  50. tokens = tokens.split(" ")
  51. words =[]
  52. for token in tokens:
  53. if token not in stopwords:
  54. words.append(token)
  55. return words
  56. ## create frequency dictionary
  57. def freq_dict(words):
  58. frequency_d = {}
  59. # tokens = tokens.split(" ")
  60. for word in words:
  61. try:
  62. frequency_d[word] += 1
  63. except KeyError:
  64. frequency_d[word] = 1
  65. return frequency_d
  66. ## sort words by frequency (import module)
  67. def sort_dict(frequency_d):
  68. c=Counter(frequency_d)
  69. frequency = c.most_common()
  70. return frequency
  71. # write words in text file
  72. def write_to_file(frequency, destination):
  73. for key, value in frequency:
  74. destination.write(("{} : {} \n".format(key, value)))
  75. destination.close()
  76. # Write new text into logbook
  77. def writetologbook(content):
  78. try:
  79. log = open(filename, "a")
  80. try:
  81. log.write(content)
  82. finally:
  83. log.close()
  84. except IOError:
  85. pass
  86. ## SCRIPT
  87. # execute functions
  88. tokens1 = remove_punct(source1)
  89. tokens2 = remove_punct(source2)
  90. words1 = remove_stopwords(tokens1)
  91. words2 = remove_stopwords(tokens2)
  92. frequency_d1 = freq_dict(words1)
  93. frequency_d2 = freq_dict(words2)
  94. frequency1 = sort_dict(frequency_d1)
  95. frequency2 = sort_dict(frequency_d2)
  96. write_to_file(frequency1, destination1)
  97. write_to_file(frequency2, destination2)
  98. source1.close()
  99. source2.close()
  100. destination1.close()
  101. destination2.close()
  102. # -------------------------------------------