import mailbox # NOTE(review): mailbox is in the Python standard library; no pip install needed
import json
# Path to the small sample mbox file used to demonstrate parsing
MBOX = 'resources/ch07-mailboxes/data/northpole.mbox'
# A routine that makes a ton of simplifying assumptions
# about converting an mbox message into a Python object
# given the nature of the northpole.mbox file in order
# to demonstrate the basic parsing of an mbox with mail
# utilities
def objectify_message(msg):
    """Convert an mbox/email message into a JSON-friendly dict.

    Assumes the message has a single part (true for northpole.mbox):
    only the first part's content type and payload are captured.

    Returns a dict with one key per header, plus 'contentType' and
    'content' keys.
    """
    # Map in the header fields from the message
    o_msg = dict(msg.items())

    # msg.walk() yields the message object itself first, so for a
    # single-part message the first item IS the (only) part
    part = next(msg.walk())
    o_msg['contentType'] = part.get_content_type()
    o_msg['content'] = part.get_payload()

    return o_msg
# Open the mbox for iteration and convert every message in it into a
# convenient JSON representation
mbox = mailbox.mbox(MBOX)
messages = [objectify_message(message) for message in mbox]
print(json.dumps(messages, indent=1))
import sys
from urllib.request import urlopen
import time
import os
import envoy # pip install envoy  -- NOTE(review): envoy is unmaintained; stdlib subprocess does the same job
# Source archive for the Enron email corpus
# NOTE(review): this CMU URL is old -- verify it is still live before relying on it
URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
# Directory the archive is downloaded to and extracted into
DOWNLOAD_DIR = "resources/ch07-mailboxes/data"
# Downloads a file and displays a download status every 5 seconds
def download(url, download_dir):
    """Download *url* into *download_dir*, printing progress every 5 seconds.

    The target file name is the last path segment of the URL.
    Returns the path of the downloaded file.
    """
    file_name = url.split('/')[-1]
    destination = os.path.join(download_dir, file_name)

    # Context managers guarantee both the response and the output file
    # are closed even if the transfer fails partway through (the original
    # leaked both handles on error)
    with urlopen(url) as u, open(destination, 'wb') as f:
        meta = u.info()
        file_size = int(meta['Content-Length'])
        print("Downloading: %s Bytes: %s" % (file_name, file_size))

        file_size_dl = 0
        block_sz = 8192
        last_update = time.time()

        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break  # EOF

            file_size_dl += len(buffer)
            f.write(buffer)

            download_status = r"%10d MB [%5.2f%%]" % (file_size_dl / 1000000.0,
                                                      file_size_dl * 100.0 / file_size)
            # Trailing backspaces rewind the cursor so the next status
            # overwrites this one on terminals that honor chr(8)
            download_status = download_status + chr(8) * (len(download_status) + 1)

            # Throttle status output to once every 5 seconds
            if time.time() - last_update > 5:
                print(download_status)
                sys.stdout.flush()
                last_update = time.time()

    return destination
# Extracts a gzipped tarfile. e.g. "$ tar xzf filename.tgz"
def tar_xzf(f):
    """Extract the gzipped tarball *f* into DOWNLOAD_DIR.

    Calls out to the system `tar` for a faster decompression. This will
    still take a while because Vagrant synchronizes thousands of files
    that are extracted to the host machine.
    """
    import subprocess

    # Pass an argument list with shell=False (the default) so the file
    # name cannot be interpreted by a shell -- the original built a
    # shell command with string interpolation via envoy
    r = subprocess.run(["tar", "xzf", f, "-C", DOWNLOAD_DIR],
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                       universal_newlines=True)
    print(r.stdout)
    print(r.stderr)
# Fetch the corpus archive, then unpack it in place
archive = download(URL, DOWNLOAD_DIR)
print("Download complete: %s" % (archive,))
tar_xzf(archive)
print("Decompression complete")
print("Data is ready")
The results of the sample code below have been saved as a file, enron.mbox.bz2, in a compressed format. You may decompress it to enron.mbox
using whatever tool you prefer, appropriate to your computer's operating system. On UNIX-like systems, the file may be decompressed with the command:
bunzip2 enron.mbox.bz2
import re
import email
from time import asctime
import os
import sys
from dateutil.parser import parse # pip install python_dateutil
# XXX: Download the Enron corpus to resources/ch07-mailboxes/data
# and unarchive it there.
# Root of the extracted per-user maildir tree
MAILDIR = 'resources/ch07-mailboxes/data/enron_mail_20110402/maildir'
# Where to write the converted mbox
MBOX = 'resources/ch07-mailboxes/data/enron.mbox'
# Convert every 'inbox' folder under MAILDIR into one big mbox file.
# The 'with' block guarantees the output file is flushed and closed.
with open(MBOX, 'w+') as mbox:
    # Walk the directories and process any folder named 'inbox'
    for (root, _dirs, file_names) in os.walk(MAILDIR):
        if root.split(os.sep)[-1].lower() != 'inbox':
            continue

        # Process each message in 'inbox'
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            # Close each message file promptly instead of leaking one
            # handle per message (the original never closed them)
            with open(file_path, errors='ignore') as msg_file:
                message_text = msg_file.read()

            # Compute fields for the From_ line in a traditional mbox message
            from_match = re.search(r"From: ([^\r\n]+)", message_text)
            date_match = re.search(r"Date: ([^\r\n]+)", message_text)
            if from_match is None or date_match is None:
                # Skip malformed messages rather than crashing on .groups()
                continue
            _from = from_match.group(1)
            _date = date_match.group(1)

            # Convert _date to the asctime representation for the From_ line
            _date = asctime(parse(_date).timetuple())

            msg = email.message_from_string(message_text)
            msg.set_unixfrom('From {0} {1}'.format(_from, _date))

            mbox.write(msg.as_string(unixfrom=True) + "\n\n")
import pandas as pd # pip install pandas
import mailbox

MBOX = 'resources/ch07-mailboxes/data/enron.mbox'
mbox = mailbox.mbox(MBOX)

# Build one row per message: every header, plus a whitespace-normalized body
rows = {}
for seq, message in enumerate(mbox):
    row = {header: message[header] for header in message.keys()}
    body = message.get_payload()
    row['Body'] = body.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').strip()
    rows[seq] = row

df = pd.DataFrame.from_dict(rows, orient='index')
df.head()
# Re-index the frame by each message's parsed Date header
df.index = df['Date'].apply(pd.to_datetime)

# Remove non-essential columns
df = df[['From', 'To', 'Cc', 'Bcc', 'Subject', 'Body']]
df.head()
df.describe()

# Restrict to 2000 through 2002 (upper bound inclusive of 2003-1-1)
# and count messages per calendar month
in_range = (df.index > '2000-1-1') & (df.index <= '2003-1-1')
vol_by_month = df.loc[in_range].resample('1M').count()['To']
print(vol_by_month)
from prettytable import PrettyTable

# Render the monthly volumes as a right-aligned text table
pt = PrettyTable(field_names=['Year', 'Month', 'Num Msgs'])
pt.align['Num Msgs'], pt.align['Month'] = 'r', 'r'

# Plain loop instead of a list comprehension used only for its side
# effects (add_row returns nothing useful)
for ind, vol in zip(vol_by_month.index, vol_by_month):
    pt.add_row([ind.year, ind.month, vol])

print(pt)
import matplotlib.pyplot as plt
# IPython/Jupyter magic: render plots inline in the notebook.
# NOTE(review): this line is only valid inside IPython, not in a plain .py script
%matplotlib inline
# Reverse so the earliest month appears at the top of the horizontal bar chart
vol_by_month[::-1].plot(kind='barh', figsize=(5,8), title='Email Volume by Month')
# Distinct values per address column (may include NaN for absent headers)
senders = df['From'].unique()
receivers = df['To'].unique()
cc_receivers = df['Cc'].unique()
bcc_receivers = df['Bcc'].unique()

print('Num Senders:', len(senders))
print('Num Receivers:', len(receivers))
print('Num CC Receivers:', len(cc_receivers))
print('Num BCC Receivers:', len(bcc_receivers))

# Switch to sets so we can do membership algebra
senders, receivers = set(senders), set(receivers)
cc_receivers, bcc_receivers = set(cc_receivers), set(bcc_receivers)

# Senders who were also direct receivers
senders_intersect_receivers = senders & receivers

# Senders that didn't receive any messages
senders_diff_receivers = senders - receivers

# Receivers that didn't send any messages
receivers_diff_senders = receivers - senders

# Senders who were any kind of receiver: union all receiver types first
all_receivers = receivers | cc_receivers | bcc_receivers
senders_all_receivers = senders & all_receivers

print("Num senders in common with receivers:", len(senders_intersect_receivers))
print("Num senders who didn't receive:", len(senders_diff_receivers))
print("Num receivers who didn't send:", len(receivers_diff_senders))
print("Num senders in common with *all* receivers:", len(senders_all_receivers))
import numpy as np

# Message counts per sender and per receiver, in descending order.
# sort_values replaces the original np.argsort + integer indexing:
# indexing a string-indexed Series with integer positions relied on a
# positional fallback that was deprecated and removed in pandas 2.0.
top_senders = df.groupby('From').count()['To'].sort_values(ascending=False)
top_receivers = df.groupby('To').count()['From'].sort_values(ascending=False)
from prettytable import PrettyTable

# Tabulate the ten most prolific senders
top10 = top_senders[:10]
pt = PrettyTable(field_names=['Rank', 'Sender', 'Messages Sent'])
pt.align['Messages Sent'] = 'r'
for rank, (sender, sent) in enumerate(zip(top10.index.values, top10.values), start=1):
    pt.add_row([rank, sender, sent])
print(pt)
from prettytable import PrettyTable

# Tabulate the ten most frequent direct receivers
top10 = top_receivers[:10]
pt = PrettyTable(field_names=['Rank', 'Receiver', 'Messages Received'])
# Bug fix: align the column that exists in THIS table -- the original
# aligned 'Messages Sent', copied from the senders table above
pt.align['Messages Received'] = 'r'
for rank, addr, vol in zip(range(1, 11), top10.index.values, top10.values):
    pt.add_row([rank, addr, vol])
print(pt)
import textwrap

search_term = 'raptor'

# Case-insensitive match in either the body or the subject.
# na=False treats messages with a missing Subject/Body as non-matches;
# a NaN in the mask would otherwise make df[query] raise.
query = (df['Body'].str.contains(search_term, case=False, na=False) |
         df['Subject'].str.contains(search_term, case=False, na=False))
results = df[query]

print('{0} results found.'.format(query.sum()))
print('Printing first 10 results...')

# Don't assume at least 10 hits -- the original raised IndexError
# whenever fewer than 10 rows matched
for i in range(min(10, len(results))):
    subject, body = results.iloc[i]['Subject'], results.iloc[i]['Body']
    print()
    print('SUBJECT: ', subject)
    print('-'*20)
    # Show at most 5 wrapped lines of each matching body
    for line in textwrap.wrap(body, width=70, max_lines=5):
        print(line)
import httplib2
import os
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
# OAuth client secret JSON downloaded from the Google API Console
CLIENT_SECRET_FILE = 'client_secret.json'
# User-agent name reported during the OAuth flow
APPLICATION_NAME = 'Gmail API Python Quickstart'
def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'gmail-python-quickstart.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        # Bug fix: the original referenced an undefined name `flags`,
        # raising NameError the first time the OAuth flow ran. Build the
        # default flags from oauth2client's own arg parser instead.
        try:
            import argparse
            flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        except ImportError:
            flags = None
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials
# Authorize an HTTP client and build the Gmail API service object
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('gmail', 'v1', http=http)

# List the user's labels and print their names
label_response = service.users().labels().list(userId='me').execute()
labels = label_response.get('labels', [])
if labels:
    print('Labels:')
    for label in labels:
        print(label['name'])
else:
    print('No labels found.')
query = 'Mining'
max_results = 10

# Search for Gmail messages containing the query term
results = service.users().messages().list(userId='me', q=query,
                                          maxResults=max_results).execute()

# The 'messages' key is absent from the response when there are no hits,
# so .get with a default avoids a KeyError on an empty result
for result in results.get('messages', []):
    print(result['id'])

    # Retrieve the message itself ('minimal' format: metadata only)
    msg = service.users().messages().get(userId='me', id=result['id'],
                                         format='minimal').execute()
    print(msg)