python2_write_parts_of_speech~20171216-080828.py (1.3 KB)
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # this script works in Python2
  4. from __future__ import division
  5. import nltk
  6. from pattern.en import tag
  7. import nltk.data
  8. from random import shuffle, choice
  9. # VARIABLES
  10. # texts
  11. source = open("../data/1984_fragment.txt", "r")
  12. destination = open("../data/1984_fragment_pos.txt", "wt")
  13. destination.write("1984\S SYNTAX using PENN'S TREEBANK\n\n")
  14. # FUNCTIONS
  15. ## SCRIPT
  16. # select 1 or more sentences from source
  17. ## split source text into list of sentences
  18. finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
  19. sentences_list = []
  20. with source as text0:
  21. for line in text0:
  22. # this returns a list with 1 element containing the entire text, sentences separated by \n
  23. sentences = '\n'.join(finding_sentences.tokenize(line.decode('utf-8').strip()))
  24. # transform string into list of sentences
  25. sentences_list = sentences.split("\n")
  26. print("sentences list", sentences_list)
  27. with destination as text1:
  28. for sentence in sentences_list:
  29. # create tuple of tuples with pairs of word + POS-tag
  30. collection = tag(sentence, tokenize=True, encoding='utf-8')
  31. # transform tuple into list to be able to manipulate it
  32. collection = list(collection)
  33. for element in collection:
  34. text1.write(element[1] + " ")