Talk:Machine Learning

== Feb. 27, 2014 ==


Folks met and hacked on the noisebridge-discuss mailing list. We created a 102MB text dump and a Python script to parse it, [[File:Py-piper-parser.txt]]. We wrote pseudocode for a Naive Bayesian filter to protect the world from trolls, and will implement it soon.
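A minimal sketch of what such a filter might look like (the tiny training set and the 'troll'/'ok' labels below are invented for illustration; this is not the pseudocode from the meeting):

<pre>
from collections import defaultdict
from math import log

def train(examples):
  # examples: list of (word_list, label) pairs
  word_counts = defaultdict(lambda: defaultdict(int))  # label -> word -> count
  label_counts = defaultdict(int)                      # label -> number of messages
  vocab = set()
  for words, label in examples:
    label_counts[label] += 1
    for w in words:
      word_counts[label][w] += 1
      vocab.add(w)
  return word_counts, label_counts, vocab

def classify(words, word_counts, label_counts, vocab):
  # pick the label with the highest log prior plus summed log likelihoods,
  # using add-one smoothing so unseen words don't zero out a class
  total = sum(label_counts.values())
  best_label, best_score = None, None
  for label in label_counts:
    score = log(float(label_counts[label]) / total)
    denom = sum(word_counts[label].values()) + len(vocab)
    for w in words:
      score += log(float(word_counts[label][w] + 1) / denom)
    if best_score is None or score > best_score:
      best_label, best_score = label, score
  return best_label

# made-up training data, just to show the shape of the inputs
examples = [("you are all idiots".split(), "troll"),
            ("meeting at noisebridge on tuesday".split(), "ok"),
            ("idiots ruin every thread".split(), "troll"),
            ("python class starts at seven".split(), "ok")]
model = train(examples)
print(classify("another thread full of idiots".split(), *model))  # -> troll
</pre>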
 
== Python to download and decompress the nb-discuss archive ==
<pre>
# Python 2; under Python 3 the equivalents are io.BytesIO and urllib.request.urlopen
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen
 
def decompress_from_url(u):
  # fetch the gzipped archive at URL u and return its decompressed contents
  # return GzipFile(fileobj = StringIO(urlopen(u).read())).read()
  f = urlopen(u)
  fs = StringIO(f.read())
  g = GzipFile(fileobj = fs)
  s = g.read()
  for x in (f, fs, g):
    x.close()
  return s
 
def discuss_gz_url(m, y):
  # build the pipermail archive URL for month m (1-12) of year y,
  # or return None if the month falls outside the archive's range
  if m < 1 or m > 12:
    return None
  if y < 2007:
    return None
  now = gmtime()
  if (y > now.tm_year) or (y == now.tm_year and m > now.tm_mon):
    return None
  mm = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')
  nb_pre = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
  nb_post = '.txt.gz'
  s = '-'.join((str(y), mm[m-1]))
  return ''.join((nb_pre, s, nb_post))
 
def all_discuss_gz_urls():
  # yield the archive URL for every month from November 2007 through the current month
  now = gmtime()
  for y in range(2007, now.tm_year + 1):
    if y == 2007:
      mm = range(11, 12 + 1)  # start with November 2007
    elif y == now.tm_year:
      mm = range(1, now.tm_mon + 1)  # end with current month
    else:
      mm = range(1, 13)
    for m in mm:
      yield discuss_gz_url(m, y)
 
def discuss_a_month(month, year):
  # fetch and decompress a single month's archive
  u = discuss_gz_url(month, year)
  s = decompress_from_url(u)
  return s
 
def spew():
  # yield the decompressed text of each monthly archive in chronological order
  for u in all_discuss_gz_urls():
    yield decompress_from_url(u)
 
def dump_uncompressed(filename="nb_wtf.txt"):
  # concatenate every month's decompressed archive into one local text file
  with open(filename, "w") as f:
    for s in spew():
      f.write(s)
 
</pre>
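
For example, with the functions above loaded, one might pull everything into a single file or just grab one month to experiment with (the filename and date here are only examples):

<pre>
# write every month's decompressed archive, Nov. 2007 to now, into one file
dump_uncompressed("nb_wtf.txt")

# or fetch a single month for quicker experiments
feb = discuss_a_month(2, 2014)
print(len(feb))  # size in bytes of the decompressed February 2014 archive
</pre>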





== Word parsing Python script ==

The function 'get_words' takes a list of dictionaries (one per email) and yields, for each message, the list of words in that message:

<pre>
def get_words(lst):
  # split each message into whitespace-separated words
  for d in lst:
    m = d['messageline']
    yield m.split()
</pre>
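
Hypothetical usage (the exact dictionary layout comes from the parser in [[File:Py-piper-parser.txt]], so the 'messageline' contents below are made up for illustration):

<pre>
emails = [{'messageline': 'Anyone up for soldering tonight?'},
          {'messageline': 'The laser cutter is down again.'}]
for words in get_words(emails):
  print(words)
# ['Anyone', 'up', 'for', 'soldering', 'tonight?']
# ['The', 'laser', 'cutter', 'is', 'down', 'again.']
</pre>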

There are plans to improve this by using nltk [1].
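
A rough sketch of what the nltk version might look like, swapping word_tokenize in for str.split (this assumes nltk is installed and its 'punkt' tokenizer data has been downloaded via nltk.download('punkt')):

<pre>
from nltk.tokenize import word_tokenize

def get_words_nltk(lst):
  # same interface as get_words, but tokenizes with nltk so that
  # punctuation is split off ("tonight?" -> "tonight", "?")
  for d in lst:
    m = d['messageline']
    yield word_tokenize(m)
</pre>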