stop_words~20171216-161732.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # Copyright (C) 2016 Constant, Algolit
  4. # This program is free software: you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation, either version 3 of the License, or
  7. # (at your option) any later version.
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details: <http://www.gnu.org/licenses/>.
  12. '''
  13. Input texts are checked for occurrences of certain words included in a list of "stopwords" established by NLTK (Natural Language Toolkit). These words are then removed.
  14. In data mining, text processing and machine learning, these so-called high frequency words are filtered out before or after natural language data is processed.
  15. Relational words such as 'the', 'is', 'at', 'which', and 'on' are considered redundant because they are too frequent, and meaningless once the word order is removed.
  16. '''
  17. # A set is a list with unique words
  18. stopwords = set()
  19. # define list of filtered words
  20. filtered_words = []
  21. # read stopwords from file & save them in a list
  22. # read from file
  23. with open("english.txt", "r") as source:
  24. # for each line
  25. for line in source:
  26. # clean returns
  27. line = line.strip()
  28. # add word to set stopwords (cfr difference with list: list.append())
  29. stopwords.add(line)
  30. # define your sentence / string
  31. sentence = 'I was at Synesthésie last night and took a bus to go home.'
  32. # print sentence
  33. print("phrase originale:", sentence)
  34. # convert string to list of words
  35. words = sentence.split(" ")
  36. # for each word of list, check if word is in stopwords, if it isn't, add word to filtered wordlist
  37. for word in words:
  38. if word not in stopwords:
  39. filtered_words.append(word)
  40. # this is the same, but shorter + no need to declare filtered_words as list in the beginning:
  41. #filtered_words = [word for word in words if word not in stopwords]
  42. # turn wordlist into string of characters
  43. new_sentence = " ".join(filtered_words)
  44. # print new sentence
  45. print("phrase réécrite:", new_sentence)