#!/usr/bin/python
# this is a shebang: https://en.wikipedia.org/wiki/Shebang_%28Unix%29
# Recovered from the paste "perec~20171124-154738.py".
'''
This script looks at each word in a given text; if the word contains the
letters p, e, r, e, c in that order (the letters of "Perec"), the word is
printed in the terminal and written to another textfile.

Made for OLA #5, Paris, 15-17 December 2017
'''
# import external modules
import re
import string

# define textfiles
SOURCE_PATH = "../data/1984_all.txt"
DESTINATION_PATH = "../data/perec.txt"

# title line written at the top of the destination file
TITLE = "Source: George Orwell's 1984\n\n\n"

# define regular expression: p, e, r, e, c in this order, with any number of
# word characters in between. Compiled once because it is used for every word.
# NOTE: the pattern is case-sensitive, so a word that only has a capital "P"
# (e.g. at the start of a sentence) is not matched.
PEREC_PATTERN = re.compile(r'(\w*p+\w*e+\w*r+\w*e+\w*c+)')


def find_perec_words(lines):
    '''
    Yield the words of `lines` that contain the letters of Perec.

    lines: any iterable of strings (an open text file works).

    Yields (word, cleaned) tuples: `word` is the token exactly as it appears
    in the text, `cleaned` is the same token with leading and trailing ASCII
    punctuation removed.
    '''
    for line in lines:
        # split() without argument splits on any run of whitespace, so the
        # newline at the end of the line never stays glued to the last word
        for word in line.split():
            # look if pattern is in word
            if PEREC_PATTERN.search(word):
                yield word, word.strip(string.punctuation)


def main(source_path=SOURCE_PATH, destination_path=DESTINATION_PATH):
    '''
    Search `source_path` for Perec words and write them to `destination_path`.

    The destination file starts with a title line, followed by one cleaned
    word per line. Every matching word is also printed in the terminal as it
    appears in the text.
    '''
    # "with" closes both textfiles, even if something goes wrong halfway
    with open(source_path, 'r') as source, \
            open(destination_path, 'w') as destination:
        # write title to destination
        destination.write(TITLE)
        # read source line by line
        for word, cleaned in find_perec_words(source):
            # print word in terminal
            print(word)
            # write word to file without punctuation
            destination.write(cleaned + '\n')


if __name__ == "__main__":
    main()