import csv
import logging
import re

import tldextract
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class EmailItem(scrapy.Item):
    page_url = scrapy.Field()
    link = scrapy.Field()
    email = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()


class MoneySpider(CrawlSpider):
    name = 'qs-contact-mit'

    # Seed URLs come from companies.csv; we assume the URL sits in the first
    # column of each row. (The original loop iterated over the still-empty
    # start_urls list, so allowed_domains never got populated.)
    with open('companies.csv') as file_CSV:
        list_CSV = list(csv.reader(file_CSV))
    start_urls = [row[0].strip() for row in list_CSV if row and row[0].strip()]

    allowed_domains = []
    for url in start_urls:
        ext = tldextract.extract(url)
        allowed_domains.append(ext.domain + '.' + ext.suffix)

    accept_keywords = (
        'relations|partners|invest|investor|manager|managers|merchants'
        '|vendors|retailers|sellers|dispensaries|clinics|shareholder'
    )

    # Case-sensitive XPath alternative to accept_keywords; usable via
    # restrict_xpaths in the LinkExtractor below.
    accept_xpath = ''.join([
        '//a[contains(text(), "invest")',
        ' or contains(text(), "Invest")',
        ' or contains(text(), "relations")',
        ' or contains(text(), "Relations")',
        ' or contains(text(), "partners")',
        ' or contains(text(), "Partners")',
        ' or contains(text(), "sellers")',
        ' or contains(text(), "Sellers")',
        ' or contains(text(), "merchants")',
        ' or contains(text(), "Merchants")',
        ' or contains(text(), "vendors")',
        ' or contains(text(), "Vendors")',
        ' or contains(text(), "managers")',
        ' or contains(text(), "Managers")',
        ' or contains(text(), "investors")',
        ' or contains(text(), "Investors")',
        ']'
    ])

    # Matches URLs nested at least six path segments deep,
    # e.g. home/dir1/dir2/dir3/dir4/dir5/dir6.
    except_keywords_regex = r'^https?://([^/]+/){5,}[^/]+/?$'
    # except_keywords_regex = r'^https?://(\D*\d\D*){3,}$'

    except_keywords = (
        'youtube|google|mozilla|facebook|twitter|instagram|linkedin|pinterest'
        # '|([^/]+/){2,}[^/]+/?|java[Ss]cript'
    )

    except_keywords_start = r'\?|#|java[Ss]cript|\.\.|tel:|phone:|fax:'

    # Common file extensions that are not followed if they occur in links.
    IGNORED_EXTENSIONS = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp',
        'tif', 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'webp',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff', 'webm',
        # video
        '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'mpeg', 'qt', 'rm',
        'swf', 'wmv', 'flv', 'm4a',
        # document
        'pdf', 'xls', 'xlsx', 'doc', 'docx', 'ppt', 'pps', 'pptx', 'pptm',
        'ppsx', 'ppsm', 'sldx', 'sldm', 'xps',  # comma was missing after 'sldm'
        'rtf', 'odt', 'ods', 'odp', 'odg', 'odf',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar', '7z', 'gz', 'bz2', 'tar'
    ]
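    # Illustration of how the filters above combine (URLs are hypothetical):
    #   https://example.com/investor-relations     followed (accept_keywords)
    #   https://www.youtube.com/channel/abc        dropped  (except_keywords)
    #   https://example.com/a/b/c/d/e/investors    followed, then skipped in
    #                                              parse_item (except_keywords_regex)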
    rules = (
        # Rule(LinkExtractor(allow=('', ), deny=(r'subsection\.php', ))),
        # Extract links whose URL mentions one of the accept_keywords and
        # parse them with parse_item; follow=False keeps the crawl shallow.
        Rule(
            LinkExtractor(
                allow=(r'^https?://[^\?#=]*(' + accept_keywords + r')[^\?#=]*$'),
                deny=('.*(' + except_keywords + ').*',
                      '(' + except_keywords_start + ').*'),
                allow_domains=(allowed_domains),
                deny_domains=(),
                deny_extensions=IGNORED_EXTENSIONS,
                restrict_xpaths=(),
                restrict_css=(),
                tags=('a', 'area'),
                attrs=('href',),
                canonicalize=False,
                unique=True,
                process_value=None,
                strip=True
            ),
            callback='parse_item',
            follow=False
        ),
    )

    # The original set a LOG_FILE attribute on the root logger, which the
    # logging module ignores; Scrapy's own settings do this properly.
    custom_settings = {
        'LOG_FILE': 'qs.log',
        'LOG_LEVEL': 'INFO',
    }

    # To also run parse_item on the start URLs themselves, override
    # parse_start_url() and delegate:
    # def parse_start_url(self, response):
    #     self.logger.info('>>>>>>>> Parse start url: %s', response)
    #     return self.parse_item(response)

    def parse_item(self, response):
        self.logger.info('>>>>>>>> Response page url: %s', response.url)
        if re.match(self.except_keywords_regex, response.url):
            self.logger.info('>>>>>>>> Ignoring response page url: %s', response.url)
            return

        ext = tldextract.extract(response.url)
        sub_domain, domain, suffix = ext.subdomain, ext.domain, ext.suffix
        page_root_domain = sub_domain + '.' + domain + '.' + suffix

        # Accepts plain addresses plus common obfuscations ("at"/"dot",
        # optionally wrapped in brackets or parentheses).
        email_regex = (r'^(mailto:)?[a-zA-Z0-9_.+-]+'
                       r'(@|[\[\(]?at[\]\)]?|[\[\(]?AT[\]\)]?)'
                       r'[a-zA-Z0-9-]+(\.|dot|DOT)[a-zA-Z0-9-.]+$')
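        # Strings the pattern above accepts (addresses are illustrative; note
        # that no whitespace is tolerated around the at/dot tokens):
        #   jane.doe@example.com
        #   mailto:jane@example.com
        #   jane(at)example.com
        #   jane[AT]exampleDOTcom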
        all_email_text = response.xpath('//body//*/text()').re(email_regex)
        email_count_in_text = len(all_email_text)
        all_email_link = response.xpath('//a/@href').re(email_regex)
        email_count_in_href = len(all_email_link)

        if email_count_in_text > 0 or email_count_in_href > 0:
            # Find the site logo: take the first anchor and treat it as the
            # logo if it links back to the site root or to a "home" page.
            logo_text = 'none'
            logo_element = response.xpath('(//a)[1]')
            if logo_element.xpath('./text()').extract_first() is not None:
                logo_link = logo_element.xpath('./@href').extract_first() or ''
                root_domain = re.sub(r'(http)s?://', '', logo_link)
                is_logo = (page_root_domain == root_domain
                           or logo_link == '/'
                           or logo_link.endswith('home'))
                if is_logo:
                    # extract_first() may return None; default='' avoids the
                    # TypeError that len(None) raised in the original.
                    if logo_element.xpath('./text()').extract_first(default=''):
                        logo_text = logo_element.xpath('./text()').extract_first()
                    elif logo_element.xpath('./@title').extract_first(default=''):
                        logo_text = logo_element.xpath('./@title').extract_first()
                    logo_text = re.sub(r'[\r\n\s]{2,}', ' ', logo_text)
            else:
                # Fall back to the first <img>. Images carry no text nodes, so
                # the original elif (testing the img's text) could never fire;
                # use the title or alt attribute instead.
                logo_element = response.xpath('(//img)[1]')
                if logo_element.xpath('./@title').extract_first(default=''):
                    logo_text = logo_element.xpath('./@title').extract_first()
                elif logo_element.xpath('./@alt').extract_first(default=''):
                    logo_text = logo_element.xpath('./@alt').extract_first()
                logo_text = re.sub(r'[\r\n\s]{2,}', ' ', logo_text)

            # Find header text: the first <header> element, then any <div>
            # whose attributes mention "header". The original called
            # re.match(text, pattern) with the arguments swapped; the pattern
            # must come first.
            header_text = ''
            header = response.xpath('(//header)[1]/*').extract_first()
            if header is not None and re.match(r'^[a-zA-Z]+$', header):
                header = re.sub(r'[\r\n\t]+', '', header)
                header = re.sub(r'\s+', ' ', header)
                header_text = header + ';'
            header_text_list = []
            header_element = response.xpath(
                '//div[contains(@*, "Header") or contains(@*, "header")]')
            for header in header_element:
                header_sub_text = header.xpath('./text()').extract_first()
                if header_sub_text is not None and re.match(r'^[a-zA-Z]+$', header_sub_text):
                    header_sub_text = re.sub(r'[\r\n\t]+', '', header_sub_text)
                    header_sub_text = re.sub(r'\s+', ' ', header_sub_text)
                    header_text_list.append(header_sub_text + ';')
            # str.join() would use header_text as a separator between the list
            # items; the intent is plain concatenation.
            header_text += ''.join(header_text_list)

            # Find footer text the same way.
            footer_text = ''
            footer = response.xpath('(//footer)[1]/*').extract_first()
            if footer is not None and re.match(r'^[a-zA-Z]+$', footer):
                footer = re.sub(r'[\r\n\t]+', '', footer)
                footer = re.sub(r'\s+', ' ', footer)
                footer_text = footer + ';'
            footer_text_list = []
            footer_element = response.xpath(
                '//div[contains(@*, "Footer") or contains(@*, "footer")]')
            for footer in footer_element:
                footer_sub_text = footer.xpath('./text()').extract_first()
                if footer_sub_text is not None and re.match(r'^[a-zA-Z]+$', footer_sub_text):
                    footer_sub_text = re.sub(r'\r\n', '', footer_sub_text)
                    footer_sub_text = re.sub(r'\s+', ' ', footer_sub_text)
                    footer_text_list.append(footer_sub_text + ';')
            footer_text += ''.join(footer_text_list)

            # Log the page-level metadata as one summary item.
            yield {
                'page_url': response.url,
                'depth': response.meta.get('depth'),
                'sub_domain': sub_domain,
                'email_count_in_text': email_count_in_text,
                'email_count_in_href': email_count_in_href,
                'logo': logo_text,
                'header': header_text,
                'footer': footer_text
            }
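        # The extraction below assumes contact markup shaped roughly like one
        # of these (illustrative):
        #
        #   <table><thead><tr><th>Name</th><th>Email</th></tr></thead>
        #     <tr><td>Jane Doe</td><td><a href="mailto:j@x.co">j@x.co</a></td></tr></table>
        #
        #   <div><h3>Jane Doe</h3><a href="mailto:j@x.co">j@x.co</a></div>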
        if email_count_in_href > 0:
            all_email_element = response.xpath('//a[starts-with(@href, "mailto:")]')
            # XPath probes for locating the name/title near a mailto link:
            #   (//a[starts-with(@href, "mailto:")])[1]/../..//a[not(starts-with(@href, "mailto:"))]
            #   (//a[starts-with(@href, "mailto:")])[1]/ancestor-or-self::div[count(descendant::a) = 3]
            #   (//a[starts-with(@href, "mailto:")])[1]/ancestor-or-self::div[count(descendant::a) > 1 and count(descendant::a) < 5]
            #   (//a[starts-with(@href, "mailto:")])[1]/ancestor-or-self::*/h3
            for email in all_email_element:
                link_info = link_info2 = link_info3 = link_info4 = ''
                link_text = email.xpath('./text()').extract_first()
                link_href = email.xpath('./@href').extract_first()
                is_plain_email = bool(re.search(email_regex, link_href, flags=re.IGNORECASE))
                if not is_plain_email:
                    continue
                if link_text is not None and re.search(email_regex, link_text, flags=re.IGNORECASE):
                    # The anchor text is itself the address, so the person's
                    # name must live in the surrounding markup.
                    parent_element = email.xpath('name(..)').extract_first()
                    if parent_element == 'td':
                        # Table layout: pair the row's cells with the headers.
                        # Iterating a SelectorList yields Selectors, so call
                        # extract() before concatenating (the original added
                        # str + Selector, a TypeError).
                        table_element = email.xpath('ancestor-or-self::table')
                        table_head_text = ''
                        for head in table_element.xpath('./thead/tr/th/text()'):
                            table_head_text += head.extract() + ';'
                        table_data_text = ''
                        for data in email.xpath('ancestor-or-self::tr/td/text()'):
                            table_data_text += data.extract() + ';'
                        yield {
                            'table_head_text': table_head_text,
                            'table_data_text': table_data_text,
                            'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE)
                        }
                    else:
                        # Otherwise scan nearby non-mailto anchors and headings.
                        for info in email.xpath('../..//a[not(starts-with(@href, "mailto:"))]'):
                            info_text = info.xpath('./text()').extract_first()
                            if info_text is not None and len(info_text) > 1:
                                link_info = info_text
                                break
                        for info in email.xpath('ancestor-or-self::*/h3'):
                            info_text = info.xpath('./text()').extract_first()
                            if info_text is not None and len(info_text) > 1:
                                link_info2 += info_text + ';'
                        for info in email.xpath('ancestor-or-self::*//h2'):
                            info_text = info.xpath('./text()').extract_first()
                            if info_text is not None and len(info_text) > 1:
                                link_info3 += info_text + ';'
                        for info in email.xpath('ancestor-or-self::*//h1'):
                            info_text = info.xpath('./text()').extract_first()
                            if info_text is not None and len(info_text) > 1:
                                link_info4 += info_text + ';'
                        yield {
                            'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE),
                            'info': link_info,
                            'info2': link_info2,
                            'info3': link_info3,
                            'info4': link_info4,
                            'text': link_text
                        }
                else:
                    yield {
                        'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE),
                        'name': link_text,
                        'text': link_text
                    }
        elif email_count_in_text > 0:
            for text_element in response.xpath('//body//*'):
                text = text_element.xpath('./text()').extract_first()
                # extract_first() may return None, which re.match() rejects
                # with a TypeError.
                if text is None or not re.match(email_regex, text, flags=re.IGNORECASE):
                    continue
                link_info = ''  # was referenced before assignment in the original
                info_element = text_element.xpath(
                    'ancestor-or-self::*[count(descendant::a) = 1]')
                for info in info_element:
                    # './/hN' keeps the search relative to the matched element;
                    # the original '//hN' searched the whole document.
                    for heading_xpath in ('.//h2/text()', './/h3/text()', './/h4/text()'):
                        info_text = info.xpath(heading_xpath).extract_first()
                        if info_text is not None and len(info_text) > 1:
                            link_info += info_text + ';'
                yield {
                    'email': text,
                    'info': link_info
                }
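
# A minimal sketch of running this spider standalone, outside a full Scrapy
# project (the output filename is illustrative; the FEEDS setting needs
# Scrapy >= 2.1). Alternatively: scrapy runspider this_file.py -o emails.jl
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'FEEDS': {'emails.jl': {'format': 'jsonlines'}},
    })
    process.crawl(MoneySpider)
    process.start()  # blocks until the crawl finishes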