#!/usr/bin/python
"""Print the relative frequency of words read from standard input.

Every whitespace-separated token on standard input is counted. Tokens are
stripped of leading/trailing punctuation and lower-cased; tokens found in
``stop_words.txt`` (one word per line, read from the current directory) and
tokens that were nothing but punctuation are ignored.

Output: the total number of tokens on the first line, then one line per
reported word in the form ``<count / total tokens> <word>``, most frequent
first.

Options:
    -n, --number      number of words to report
    -t, --threshold   report words where frequency > threshold
"""

import sys
from operator import itemgetter
from optparse import OptionParser

# Characters stripped from both ends of every token before it is counted.
punct = """'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'"""


def load_stop_words(path="stop_words.txt"):
    """Return the set of words to ignore, read one per line from *path*.

    The empty string is always included so that tokens consisting only of
    punctuation (which strip down to '') are never reported.

    Raises IOError/OSError if *path* cannot be opened.
    """
    # 'with' guarantees the file is closed even if reading fails.
    with open(path) as handle:
        stop_words = set(line.strip() for line in handle)
    stop_words.add('')
    return stop_words


def count_words(lines, stop_words):
    """Count word occurrences in an iterable of text lines.

    Returns a tuple ``(word_count, freq)`` where ``word_count`` is the number
    of whitespace-separated tokens seen (stop words included) and ``freq``
    maps each remaining normalised word to its number of occurrences.
    """
    word_count = 0
    freq = {}
    for line in lines:
        for token in line.split():
            # The total counts every token, even ones that are later ignored,
            # because it is the denominator of the reported frequencies.
            word_count += 1
            word = token.strip(punct).lower()
            if word not in stop_words:
                freq[word] = freq.get(word, 0) + 1
    return word_count, freq


def rank_words(freq, word_count, number=None, threshold=0):
    """Return ``(relative_frequency, word)`` pairs, most frequent first.

    Only the top *number* words are considered (all of them when *number* is
    None), and of those only the ones whose raw count is greater than
    *threshold* are returned. The threshold is applied after the top-N cut.
    """
    ranked = sorted(freq.items(), key=itemgetter(1), reverse=True)
    # float() keeps true division on Python 2 as well as Python 3.
    return [(float(count) / word_count, word)
            for word, count in ranked[:number]
            if count > threshold]


def main(argv=None):
    """Parse the command line, read standard input and print the report."""
    parser = OptionParser()
    parser.add_option('-n', '--number', type="int", default=None,
                      help='number of words to report')
    parser.add_option("-t", "--threshold", type="int", default=0,
                      help="report words where frequency > threshold")
    options, _ = parser.parse_args(argv)

    stop_words = load_stop_words()
    word_count, freq = count_words(sys.stdin, stop_words)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(word_count)
    for share, word in rank_words(freq, word_count,
                                  options.number, options.threshold):
        print("%s %s" % (share, word))


if __name__ == "__main__":
    main()