summary | refs | log | tree | commit | diff | stats
path: root/src/emailinfo.py
blob: 5ff17204c44d5bb981a86f93948b1d0c1b494847 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

import re
import ipaddr
import email.utils
import datetime
from weblookuptools import *
from email.parser import Parser
from email.utils import getaddresses

class EmailInfo:
	"""Sender information extracted from a single e-mail file.

	An instance carries the chosen IP address (ip), the value derived from
	the Date header (date), a diagnostic note describing how the address
	was picked (debug) and the path of the file it came from (path).
	Use EmailInfo.parseFile(fileName) to build one from a message file.
	"""
	#Cached result of iplocation(); stays None until location() is first called
	__location = None
	#Dotted-quad IPv4 address with every octet limited to 0-255
	ipRegex = re.compile(r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)")
	#Same four octets joined by dashes, the form some host names embed
	ipInHostRegex = re.compile(r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)-){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)")
	#Host name pattern; generateHostRegex is not defined in this file (presumably it comes from weblookuptools)
	hostRegex = re.compile(generateHostRegex())
	#Start of the "by" / "for" clause of a Received header, matched as a whole word
	_trailerRegex = re.compile(r"(?:^|\s)(?:by|for)(?:\s|$)")
	parser = Parser()
	def __init__(self, ip = "", date = 0, debug = "", path = ""):
		"""Store the address, date, diagnostic note and source path as given."""
		self.ip = ip
		self.date = date
		self.debug = debug
		self.path = path
	def location(self):
		"""Return iplocation(self.ip), calling it only until it yields a non-None result."""
		if self.__location is None:
			self.__location = iplocation(self.ip)
		return self.__location
	def __cmp__(self, other):
		"""Order instances by their date attribute (Python 2 comparison protocol)."""
		if self.date > other.date:
			return 1
		if self.date < other.date:
			return -1
		return 0
	@classmethod
	def parseFile(cls, fileName):
		"""Build an EmailInfo from the headers of the message stored in fileName.

		The Received headers are examined from the bottom of the header block
		upwards (earliest hop first). The first hop that yields a usable
		address wins; literal IPs are preferred over resolved host names, and
		within one hop the last candidate is taken.

		Returns an instance whose date is a naive UTC datetime built from the
		Date header, or False when that header is missing or unparseable.
		Raises RuntimeError when no hop yields an address.
		"""
		#Only the headers are needed (headersonly=True); the with block closes the file even if parsing fails
		with open(fileName) as handle:
			msg = cls.parser.parse(handle, True)
		hops = []
		for line in msg.get_all("Received", []):
			hop = cls._candidatesFromReceived(line)
			#Keep only headers that produced at least one candidate
			if hop[0] or hop[1]:
				hops.append(hop)
		try:
			date = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(email.utils.parsedate_tz(msg.get("Date"))))
		except Exception:
			#Missing or malformed Date header
			date = False
		#recipients = getaddresses(msg.get_all('to', []) + msg.get_all('cc', []) + msg.get_all('bcc', []) + msg.get_all('resent-to', []) + msg.get_all('resent-cc', []) + msg.get_all('resent-bcc', []))
		#Earlier lines come later in the file, so we look at the file reversed
		hops.reverse()
		for i, (hosts, ips) in enumerate(hops):
			#Host names are resolved hop by hop, so lookups stop as soon as an address is found
			resolved = cls._uniqueKeepLast(cls._resolveHosts(hosts))
			ips = cls._uniqueKeepLast(ips)
			ip = False
			debug = False
			if i == 0 and (len(resolved) == 1 or len(ips) == 1):
				#Unambiguous hit on the very first hop: nothing to report
				pass
			elif len(ips) > 1:
				debug = "MANY IP: %s, hit %i" % (ips, (i + 1))
			elif len(ips) == 0 and len(resolved) > 1:
				debug = "MANY DOMAIN: %s, hit %i" % (resolved, (i + 1))
			elif len(ips) == 1 and i > 0:
				debug = "IP HIT ON %i: %s" % ((i + 1), ips[0])
			elif len(resolved) == 1 and i > 0:
				debug = "HOST HIT ON %i: %s" % ((i + 1), resolved[0])
			#Choose last IP, preferring IPs to resolved hosts
			if ips:
				ip = ips[-1]
			elif resolved:
				ip = resolved[-1]
			if ip:
				return cls(ip, date, debug, fileName)
		raise RuntimeError("Unable to parse %s" % fileName)
	@classmethod
	def _candidatesFromReceived(cls, line):
		"""Return [hosts, ips] found in the leading part of one Received header."""
		#After the "by" or "for" there is no useful information. Match them as
		#whole words so names such as "rugby.example.com" are not cut in half,
		#and cut at whichever of the two comes first even when only one is present.
		cut = cls._trailerRegex.search(line)
		if cut is not None:
			line = line[:cut.start()]
		ips = cls.ipRegex.findall(line)
		hosts = []
		#Some host names have IPs embedded in them, like dyn-23-54-128-44.columbia.edu.
		#Those become IP candidates (placed in front) instead of host candidates.
		for host in cls.hostRegex.findall(line):
			embedded = cls.ipInHostRegex.findall(host)
			if embedded:
				ips.insert(0, embedded[0].replace("-", "."))
			else:
				hosts.append(host)
		#Filter out non-internet IPs using the ipaddr library.
		#We separate the hosts from the ips because we give more importance to IPs, since host resolution may have changed over the years.
		return [hosts, [ip for ip in ips if cls.isInternetIP(ip)]]
	@classmethod
	def _resolveHosts(cls, hosts):
		"""Resolve host names with ipFromHost, keeping only internet-routable results."""
		resolved = []
		for host in hosts:
			try:
				ip = ipFromHost(host)
				if cls.isInternetIP(ip):
					resolved.append(ip)
			except Exception:
				#Best effort: a host that does not resolve (or resolves to something unusable) is dropped
				continue
		return resolved
	@staticmethod
	def _uniqueKeepLast(items):
		"""Return items without duplicates, keeping the last occurrence of each value in order."""
		kept = []
		for item in reversed(items):
			if item not in kept:
				kept.append(item)
		kept.reverse()
		return kept
	@staticmethod
	def isInternetIP(ip):
		"""Return True unless ip is 0.0.0.0, loopback, link-local, multicast or RFC 1918 private.

		ip is passed to ipaddr.IPv4, so whatever that constructor raises for a
		malformed address propagates to the caller.
		"""
		ipObj = ipaddr.IPv4(ip)
		return ip != "0.0.0.0" and not ipObj.IsLoopback() and not ipObj.IsLinkLocal() and not ipObj.IsMulticast() and not ipObj.IsRFC1918()