#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2016 Constant, Algolit

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: http://www.gnu.org/licenses/

'''
Input texts are checked against occurrences of certain words included in a
list of "stopwords" established by NLTK (Natural Language Toolkit).
These words are then removed.

In data mining, text processing and machine learning, these so-called
high frequency words are filtered out before or after natural language data
is processed. Relational words such as 'the', 'is', 'at', 'which', and 'on'
are considered redundant because they are too frequent, and meaningless once
the word order is removed.
'''

# File holding one stop word per line.
STOPWORDS_FILE = "english.txt"

# The example sentence that gets filtered when the script is run.
SENTENCE = 'I was at Synesthésie last night and took a bus to go home.'


def load_stopwords(path=STOPWORDS_FILE):
    """Read the stop-word list at *path* and return it as a set.

    Each line of the file is treated as one stop word. Surrounding
    whitespace is stripped, entries are lower-cased so that matching can be
    case-insensitive, and blank lines are skipped so the empty string never
    ends up in the set.

    The file is decoded as UTF-8 explicitly: relying on the platform default
    would make the result depend on the locale of the machine.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as source:
        stripped = (line.strip().lower() for line in source)
        # a set keeps each word once and makes membership tests cheap
        return {word for word in stripped if word}


def remove_stopwords(sentence, stopwords):
    """Return *sentence* with every word found in *stopwords* removed.

    The sentence is split on any run of whitespace, so double spaces, tabs
    and newlines neither produce empty "words" nor glue two words together.
    Words are compared in lower case, which means a capitalised stop word
    ("I", or "The" at the start of a sentence) is removed as well. The words
    that are kept retain their original spelling and are joined with single
    spaces.

    Punctuation is not separated from words: "home." is compared as
    "home." and is therefore kept even if "home" is a stop word.
    """
    # normalise once, outside the loop, in case the caller's set is mixed-case
    lowered = {stopword.lower() for stopword in stopwords}
    kept = [word for word in sentence.split() if word.lower() not in lowered]
    return " ".join(kept)


def main():
    """Filter the example sentence and print it before and after."""
    stopwords = load_stopwords()
    print("phrase originale:", SENTENCE)
    print("phrase réécrite:", remove_stopwords(SENTENCE, stopwords))


if __name__ == "__main__":
    main()