summaryrefslogtreecommitdiffstats
path: root/src/emailinfo.py
blob: 95fc1fc3a91d9966d9535c9eb8a9e12b15ae4253 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

import re
import ipaddr
import email.utils
import datetime
from weblookuptools import *

class EmailInfo:
	"""Originating IP address, date and diagnostics extracted from one e-mail file.

	Instances are normally built with EmailInfo.parseFile(), which scans the
	"Received:" and "Date:" header lines of a message file.

	Attributes:
		ip: the IP address chosen as the origin of the message.
		date: naive UTC datetime parsed from the "Date:" header; parseFile()
			stores False when no date could be parsed (the constructor
			default is 0).
		debug: human-readable note on how the IP was selected, or a false
			value when the selection was unambiguous.
		path: file name the information was parsed from.
	"""
	# Cache for location(); stays None until the first lookup.
	__location = None
	# Dotted-quad IPv4 address with every octet limited to 0-255.
	ipRegex = re.compile(r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)")
	# Same as ipRegex but dash-separated, the form embedded in some host names.
	ipInHostRegex = re.compile(r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)-){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)")
	# Host-name pattern; generateHostRegex() is defined outside this file
	# (presumably in weblookuptools - confirm).
	hostRegex = re.compile(generateHostRegex())
	# A whitespace-delimited "by" or "for" keyword. Matching whole tokens keeps
	# host names such as "stanford.edu" or "derby.example.com" from being
	# mistaken for the keyword.
	cutoffRegex = re.compile(r"(?<!\S)(?:by|for)(?!\S)")

	def __init__(self, ip = "", date = 0, debug = "", path = ""):
		"""Store the parsed values; see the class docstring for their meaning."""
		self.ip = ip
		self.date = date
		self.debug = debug
		self.path = path

	def location(self):
		"""Return iplocation(self.ip), caching the result after the first call.

		A None result is not cached, so the lookup is repeated in that case.
		"""
		if self.__location is not None:
			return self.__location
		self.__location = iplocation(self.ip)
		return self.__location

	def __cmp__(self, other):
		"""Order instances by their date attribute (Python 2 comparison hook)."""
		if self.date > other.date:
			return 1
		if self.date < other.date:
			return -1
		return 0

	@staticmethod
	def _dedupeKeepLast(items):
		"""Return a new list with duplicates removed, keeping each value's last occurrence."""
		kept = []
		for item in reversed(items):
			if item not in kept:
				kept.append(item)
		kept.reverse()
		return kept

	@classmethod
	def parseFile(cls, fileName):
		"""Build an EmailInfo from the headers of the message stored in fileName.

		Every line starting with "Received:" is searched for IP addresses and
		host names; the first line starting with "Date:" that parses supplies
		the date. The Received lines are then examined from the bottom of the
		file upwards and the first one that yields a usable internet IP wins.

		NOTE: only the physical line that starts with "Received:" is examined;
		folded continuation lines of the same header are ignored.

		Args:
			fileName: path of the e-mail file to read.

		Returns:
			A new instance of cls holding the chosen IP, the date (False when
			none could be parsed), a debug note and fileName.

		Raises:
			RuntimeError: if no Received line yields a usable IP address.
		"""
		receiveHosts = []
		date = False
		# The with-block guarantees the file is closed even if parsing raises.
		with open(fileName) as handle:
			for line in handle:
				line = line.strip()
				if line.startswith("Received:"): #Only look at Received lines
					#After the "by" or "for" keyword there is no useful information
					cutoff = cls.cutoffRegex.search(line)
					if cutoff is not None:
						line = line[:cutoff.start()]
					ips = cls.ipRegex.findall(line)
					#Some host names have IPs embedded in them, like dyn-23-54-128-44.columbia.edu
					#Build a fresh list instead of removing from the one being iterated,
					#which used to skip the host following every removed entry.
					hosts = []
					for host in cls.hostRegex.findall(line):
						embedded = cls.ipInHostRegex.findall(host)
						if embedded:
							ips.insert(0, embedded[0].replace("-", "."))
						else:
							hosts.append(host)
					#Filter out non-internet IPs using Google's library
					filteredIps = [ip for ip in ips if cls.isInternetIP(ip)]
					#We separate the hosts from the IPs because we give more importance to IPs,
					#since host resolution may have changed over the years.
					if hosts or filteredIps:
						receiveHosts.append([hosts, filteredIps])
				elif line.startswith("Date:") and not date:
					try:
						date = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(email.utils.parsedate_tz(line[6:])))
					except Exception:
						#Unparseable date: keep looking at later Date: lines
						date = False
		#Earlier hops come later in the file, so we look at the entries reversed
		receiveHosts.reverse()
		for i, (hosts, ips) in enumerate(receiveHosts):
			#Resolve all hostnames to IPs, dropping those that do not resolve
			#or that resolve to a non-internet address
			resolved = []
			for host in hosts:
				try:
					candidate = ipFromHost(host)
					if cls.isInternetIP(candidate):
						resolved.append(candidate)
				except Exception:
					#Best effort: a host that cannot be resolved is simply ignored
					continue
			#Remove duplicate IPs; the last occurrence is kept because the last entry is the one chosen below
			hosts = cls._dedupeKeepLast(resolved)
			ips = cls._dedupeKeepLast(ips)
			ip = False
			debug = False
			if i == 0 and (len(hosts) == 1 or len(ips) == 1):
				#Unambiguous hit on the very first hop: nothing worth reporting
				pass
			elif len(ips) > 1:
				debug = "MANY IP: %s, hit %i" % (ips, (i + 1))
			elif len(ips) == 0 and len(hosts) > 1:
				debug = "MANY DOMAIN: %s, hit %i" % (hosts, (i + 1))
			elif len(ips) == 1 and i > 0:
				debug = "IP HIT ON %i: %s" % ((i + 1), ips[0])
			elif len(hosts) == 1 and i > 0:
				debug = "HOST HIT ON %i: %s" % ((i + 1), hosts[0])
			#Choose last IP, preferring IPs to resolved hosts
			if ips:
				ip = ips[-1]
			elif hosts:
				ip = hosts[-1]
			if ip:
				return cls(ip, date, debug, fileName)
		raise RuntimeError("Unable to parse %s" % fileName)

	@staticmethod
	def isInternetIP(ip):
		"""Return True if ip (a dotted-quad string) is a public internet address.

		Rejects "0.0.0.0" and anything the ipaddr library reports as loopback,
		link-local, multicast or RFC 1918 private. Whatever ipaddr.IPv4()
		raises for a string it cannot parse propagates to the caller.
		"""
		ipObj = ipaddr.IPv4(ip)
		return ip != "0.0.0.0" and not ipObj.IsLoopback() and not ipObj.IsLinkLocal() and not ipObj.IsMulticast() and not ipObj.IsRFC1918()