#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
import re
import ipaddr
import email.utils
import datetime
from weblookuptools import *
from email.parser import Parser
from email.utils import getaddresses


class EmailInfo:
    """Origin information extracted from the headers of one e-mail file.

    An instance records the IP address judged to be the sender's, the
    message date, a diagnostic note describing how the address was chosen,
    and the path of the file the message was read from.  Instances are
    normally created through :meth:`parseFile`.
    """

    # Cached result of iplocation(self.ip); filled in lazily by location().
    __location = None

    # Dotted-quad IPv4 address, e.g. "128.59.1.2".
    ipRegex = re.compile(r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)")
    # IPv4 address written with dashes, as found inside some host names.
    ipInHostRegex = re.compile(r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)-){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)")
    # Host-name pattern; generateHostRegex() is not defined in this file
    # (presumably it comes from the weblookuptools star import).
    hostRegex = re.compile(generateHostRegex())
    # Stand-alone "by" / "for" keyword of a Received header.  The look-arounds
    # keep it from matching inside host names such as "derby.example.com".
    _trailerRegex = re.compile(r"(?<![\w.-])(?:by|for)(?![\w.-])")
    parser = Parser()

    def __init__(self, ip = "", date = 0, debug = "", path = ""):
        """Store the sender IP, message date, debug note and source path."""
        self.ip = ip
        self.date = date
        self.debug = debug
        self.path = path

    def location(self):
        """Return iplocation(self.ip), computing it only on the first call."""
        if self.__location is not None:
            return self.__location
        self.__location = iplocation(self.ip)
        return self.__location

    def __cmp__(self, other):
        """Order instances by their ``date`` attribute."""
        if self.date > other.date:
            return 1
        if self.date < other.date:
            return -1
        return 0

    @classmethod
    def parseFile(cls, fileName):
        """Build an EmailInfo from the message stored in ``fileName``.

        The "Received" headers are examined from the earliest hop to the
        latest.  For each hop the text before the "by"/"for" keyword is
        scanned for IP addresses and host names; the first hop that yields a
        usable Internet address determines the result.  Literal IPs are
        preferred over resolved host names, and the last candidate of a hop
        wins.

        Returns:
            cls(ip, date, debug, fileName), where ``date`` is a naive UTC
            datetime or False when the Date header cannot be parsed, and
            ``debug`` is False or a string describing an ambiguous or late
            hit.

        Raises:
            RuntimeError: if no hop yields a usable address.
        """
        # Only the headers are needed; the handle is closed even if parsing
        # raises.
        with open(fileName) as handle:
            msg = cls.parser.parse(handle, True)

        # Earlier hops come later in the file, so walk the headers reversed.
        receiveHosts = []
        for line in reversed(msg.get_all("Received", [])):
            hosts, ips = cls._extractCandidates(line)
            # Hosts are kept apart from IPs because IPs are given more
            # importance: host resolution may have changed over the years.
            if hosts or ips:
                receiveHosts.append((hosts, ips))

        try:
            date = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(email.utils.parsedate_tz(msg.get("Date"))))
        except Exception:
            # Best effort: a missing or malformed Date header is not fatal.
            date = False

        # NOTE: recipient extraction is currently disabled:
        #recipients = getaddresses(msg.get_all('to', []) + msg.get_all('cc', []) + msg.get_all('bcc', []) + msg.get_all('resent-to', []) + msg.get_all('resent-cc', []) + msg.get_all('resent-bcc', []))

        for hop, (hosts, ips) in enumerate(receiveHosts):
            # Host names are resolved hop by hop so that no lookups are made
            # for hops after the one that produces the answer.
            hosts = cls._dedupeKeepLast(cls._resolveHosts(hosts))
            ips = cls._dedupeKeepLast(ips)
            debug = cls._describeHit(hop, hosts, ips)

            # Choose the last candidate, preferring IPs to resolved hosts.
            if ips:
                ip = ips[-1]
            elif hosts:
                ip = hosts[-1]
            else:
                ip = False
            if ip:
                return cls(ip, date, debug, fileName)

        raise RuntimeError("Unable to parse %s" % fileName)

    @classmethod
    def _extractCandidates(cls, line):
        """Return (hosts, internetIps) found in one Received header line."""
        # After the "by" or "for" keyword there is no useful information;
        # cut at whichever of the two appears first.
        cutoff = cls._trailerRegex.search(line)
        if cutoff is not None:
            line = line[:cutoff.start()]

        ips = cls.ipRegex.findall(line)
        hosts = []
        # Some host names have IPs embedded in them, like
        # dyn-23-54-128-44.columbia.edu; treat those as IPs, not hosts.
        for host in cls.hostRegex.findall(line):
            embedded = cls.ipInHostRegex.findall(host)
            if embedded:
                ips.insert(0, embedded[0].replace("-", "."))
            else:
                hosts.append(host)

        # Filter out non-internet IPs using Google's library.
        return hosts, [ip for ip in ips if cls.isInternetIP(ip)]

    @classmethod
    def _resolveHosts(cls, hosts):
        """Resolve host names to Internet IPs, dropping any that fail."""
        resolved = []
        for host in hosts:
            try:
                ip = ipFromHost(host)
                if cls.isInternetIP(ip):
                    resolved.append(ip)
            except Exception:
                # Non-resolving or malformed host names are skipped.
                continue
        return resolved

    @staticmethod
    def _dedupeKeepLast(items):
        """Return ``items`` without duplicates, keeping each last occurrence."""
        seen = set()
        kept = []
        for item in reversed(items):
            if item not in seen:
                seen.add(item)
                kept.append(item)
        kept.reverse()
        return kept

    @staticmethod
    def _describeHit(hop, hosts, ips):
        """Return the debug note for a hop, or False for a clean first-hop hit."""
        if hop == 0 and (len(hosts) == 1 or len(ips) == 1):
            return False
        if len(ips) > 1:
            return "MANY IP: %s, hit %i" % (ips, (hop + 1))
        if len(ips) == 0 and len(hosts) > 1:
            return "MANY DOMAIN: %s, hit %i" % (hosts, (hop + 1))
        if len(ips) == 1 and hop > 0:
            return "IP HIT ON %i: %s" % ((hop + 1), ips[0])
        if len(hosts) == 1 and hop > 0:
            return "HOST HIT ON %i: %s" % ((hop + 1), hosts[0])
        return False

    @staticmethod
    def isInternetIP(ip):
        """Return True if ``ip`` is a publicly routable IPv4 address string.

        Rejects 0.0.0.0 and anything ipaddr reports as loopback, link-local,
        multicast or RFC 1918 private.  Whatever ipaddr.IPv4 raises for an
        unparsable value propagates to the caller.
        """
        ipObj = ipaddr.IPv4(ip)
        return ip != "0.0.0.0" and not ipObj.IsLoopback() and not ipObj.IsLinkLocal() and not ipObj.IsMulticast() and not ipObj.IsRFC1918()