#!/usr/bin/env python
# This script creates a sorted frequency dictionary with stopwords.
# Copyright (C) 2016 Constant, Algolit, An Mertens
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
from __future__ import division

import string
from collections import Counter

from nltk.corpus import stopwords
  16. # VARIABLES
  17. # textfiles
  18. source1 = open('../data/1984_fragment.txt', 'rt')
  19. source2 = open('../data/verne_fragment.txt', 'rt')
  20. destination1 = open('../data/counting_1984.txt', 'wt')
  21. destination2 = open('../data/counting_verne.txt', 'wt')
  22. # FUNCTIONS
  23. # PREPROCESSING TEXT FILE
  24. ## remove caps + breaks + punctuation
  25. def remove_punct(f):
  26. tokens = (' '.join(line.replace('\n', '') for line in f)).lower()
  27. for c in string.punctuation:
  28. tokens= tokens.replace(c,"")
  29. tokens = tokens.strip()
  30. #print("tokens", type(tokens))
  31. return tokens
  32. ## create frequency dictionary
  33. def freq_dict(tokens):
  34. tokens = tokens.split(" ")
  35. frequency_d = {}
  36. # tokens = tokens.split(" ")
  37. for token in tokens:
  38. try:
  39. frequency_d[token] += 1
  40. except KeyError:
  41. frequency_d[token] = 1
  42. return frequency_d
  43. ## sort words by frequency (import module)
  44. def sort_dict(frequency_d):
  45. c=Counter(frequency_d)
  46. frequency = c.most_common()
  47. return frequency
  48. # write words in text file
  49. def write_to_file(frequency, g):
  50. for key, value in frequency:
  51. g.write(("{} : {} \n".format(key, value)))
  52. g.close()
  53. # Write new text into logbook
  54. def writetologbook(content):
  55. try:
  56. log = open(filename, "a")
  57. try:
  58. log.write(content)
  59. finally:
  60. log.close()
  61. except IOError:
  62. pass
  63. # SCRIPT
  64. # execute functions
  65. tokens1 = remove_punct(source1)
  66. tokens2 = remove_punct(source2)
  67. frequency_d1 = freq_dict(tokens1)
  68. frequency_d2 = freq_dict(tokens2)
  69. frequency1 = sort_dict(frequency_d1)
  70. frequency2 = sort_dict(frequency_d2)
  71. # Write in textfile
  72. write_to_file(frequency1, destination1)
  73. write_to_file(frequency2, destination2)
  74. source1.close()
  75. source2.close()
  76. destination1.close()
  77. destination2.close()