Talk:Machine Learning: Difference between revisions
Jump to navigation
Jump to search
(utility code to retrieve & decompress archived nb-discuss) |
|||
Line 1: | Line 1: | ||
== Feb. 27, 2014 == | == Feb. 27, 2014 == | ||
Folks met and hacked on the noisebridge discuss mailing list. We created a | Folks met and hacked on the noisebridge discuss mailing list. We created a 102MB text dump, and a python script to parse it, [[File:Py-piper-parser.txt]]. We wrote pseudo code to implement a Naive Bayesian filter to protect the world from trolls. Will implement soon. | ||
== python to download and decompress nb-discuss archive == | |||
<pre> | |||
from StringIO import StringIO | |||
from gzip import GzipFile | |||
from time import gmtime | |||
from urllib import urlopen | |||
def decompress_from_url(u):
    """Fetch the gzip resource at URL *u* and return its decompressed contents.

    Downloads the whole compressed body into memory, gunzips it, and
    returns the resulting text.  The URL handle and both in-memory file
    objects are closed even when reading or decompression raises
    (the original closed them only on the success path).
    """
    f = urlopen(u)
    fs = None
    g = None
    try:
        fs = StringIO(f.read())
        g = GzipFile(fileobj = fs)
        return g.read()
    finally:
        # Close whatever was actually opened, newest first.
        for x in (g, fs, f):
            if x is not None:
                x.close()
def discuss_gz_url(m, y):
    """Return the pipermail URL of the gzipped noisebridge-discuss
    archive for month *m* (1-12) of year *y*.

    Returns None when no such archive can exist: month out of range,
    year before the list started (2007), or a month in the future.
    """
    # Guard clauses: reject impossible months/years up front.
    if not (1 <= m <= 12):
        return None
    if y < 2007:
        return None
    today = gmtime()
    in_future = (y > today.tm_year) or (y == today.tm_year and m > today.tm_mon)
    if in_future:
        return None
    month_names = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')
    prefix = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
    suffix = '.txt.gz'
    # Archives are named like ".../2014-February.txt.gz".
    return prefix + str(y) + '-' + month_names[m - 1] + suffix
def all_discuss_gz_urls():
    """Yield the archive URL for every month of noisebridge-discuss,
    from November 2007 (the first archived month) through the current
    month, in chronological order."""
    today = gmtime()
    for year in range(2007, today.tm_year + 1):
        # Pick the inclusive month bounds for this year.
        if year == 2007:
            first, last = 11, 12      # list began in November 2007
        elif year == today.tm_year:
            first, last = 1, today.tm_mon  # stop at the current month
        else:
            first, last = 1, 12
        for month in range(first, last + 1):
            yield discuss_gz_url(month, year)
def discuss_a_month(month, year):
    """Download and return the decompressed archive text for one month.

    Raises ValueError when (month, year) does not name an existing
    archive (out-of-range month, pre-2007 year, or a future month);
    previously the None URL fell through to decompress_from_url and
    produced an obscure failure inside urlopen.
    """
    u = discuss_gz_url(month, year)
    if u is None:
        raise ValueError('no noisebridge-discuss archive for %d/%d' % (month, year))
    return decompress_from_url(u)
def spew():
    """Return an iterator over the decompressed text of every monthly
    archive, downloading each one lazily as it is consumed."""
    return (decompress_from_url(u) for u in all_discuss_gz_urls())
def dump_uncompressed(filename="nb_wtf.txt"):
    """Concatenate every decompressed monthly archive into *filename*.

    writelines() consumes the spew() generator directly, adding no
    separators — identical output to writing each chunk in a loop.
    """
    with open(filename, "w") as out:
        out.writelines(spew())
</pre> | |||
Revision as of 23:47, 1 March 2014
Feb. 27, 2014
Folks met and hacked on the noisebridge discuss mailing list. We created a 102MB text dump, and a python script to parse it, File:Py-piper-parser.txt. We wrote pseudo code to implement a Naive Bayesian filter to protect the world from trolls. Will implement soon.
python to download and decompress nb-discuss archive
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen

def decompress_from_url(u):
    """Return the gunzipped contents of the resource at URL u."""
    response = urlopen(u)
    buffered = StringIO(response.read())
    unzipped = GzipFile(fileobj = buffered)
    text = unzipped.read()
    for handle in (response, buffered, unzipped):
        handle.close()
    return text

def discuss_gz_url(m, y):
    """Return the pipermail URL of the gzipped noisebridge-discuss
    archive for month m (1-12) of year y, or None when that month
    cannot have an archive (invalid, pre-2007, or in the future)."""
    if m < 1 or m > 12:
        return None
    if y < 2007:
        return None
    now = gmtime()
    if (y > now.tm_year) or (y == now.tm_year and m > now.tm_mon):
        return None
    month_names = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')
    prefix = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
    suffix = '.txt.gz'
    return prefix + str(y) + '-' + month_names[m - 1] + suffix

def all_discuss_gz_urls():
    """Yield every archive URL from November 2007 through the current
    month, in chronological order."""
    now = gmtime()
    for year in range(2007, now.tm_year + 1):
        if year == 2007:
            months = range(11, 13)            # list began November 2007
        elif year == now.tm_year:
            months = range(1, now.tm_mon + 1) # stop at current month
        else:
            months = range(1, 13)
        for month in months:
            yield discuss_gz_url(month, year)

def discuss_a_month(month, year):
    """Download and return the decompressed archive for one month."""
    return decompress_from_url(discuss_gz_url(month, year))

def spew():
    """Yield the decompressed text of each monthly archive in turn."""
    for url in all_discuss_gz_urls():
        yield decompress_from_url(url)

def dump_uncompressed(filename="nb_wtf.txt"):
    """Concatenate every decompressed monthly archive into one file."""
    with open(filename, "w") as out:
        out.writelines(spew())
Word parsing python script
Function 'get_words' takes a list of email dictionaries. For each message, it yields the list of words in that message:
def get_words(lst):
    """For each email dict in lst, yield the list of whitespace-separated
    words taken from its 'messageline' value."""
    for entry in lst:
        yield entry['messageline'].split()
Plans to improve by using nltk[1]