moneyspider.py

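"""Scrapy CrawlSpider that visits company sites listed in companies.csv,
follows links whose URLs contain investor/partner/vendor-style keywords,
and extracts email addresses together with nearby name, title, heading,
logo, header, and footer text. Results are yielded as plain dicts; the
crawl log is written to qs.log."""
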
import csv
import logging
import re

import tldextract
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import Selector


# Item definition for extracted contacts (currently unused; the spider yields plain dicts).
class EmailItem(scrapy.Item):
    page_url = scrapy.Field()
    link = scrapy.Field()
    email = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
class MoneySpider(CrawlSpider):
    name = 'qs-contact-mit'
    allowed_domains = []

    # Load the company list; it is assumed here that the first column of
    # companies.csv holds the start URL for each company.
    file_CSV = open('companies.csv')
    data_CSV = csv.reader(file_CSV)
    list_CSV = list(data_CSV)
    start_urls = [row[0] for row in list_CSV if row]

    # Restrict the crawl to the registered domain of each start URL.
    for url in start_urls:
        home_domain = tldextract.extract(url).domain + '.' + tldextract.extract(url).suffix
        allowed_domains.append(home_domain)
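    # Illustrative only: companies.csv is assumed to hold one start URL per row
    # in its first column, e.g.
    #   https://example.com
    #   https://example.org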
    # URL keywords that mark pages worth following (investor/partner/vendor pages).
    accept_keywords = (
        'relations|partners|invest|investor|manager|managers|merchants'
        '|vendors|retailers|sellers|dispensaries|clinics|shareholder'
    )

    # XPath targeting links whose text mentions the same topics (currently unused).
    accept_xpath = ''.join([
        '//a[contains(text(), "invest")',
        ' or contains(text(), "Invest")',
        ' or contains(text(), "relations")',
        ' or contains(text(), "Relations")',
        ' or contains(text(), "partners")',
        ' or contains(text(), "Partners")',
        ' or contains(text(), "sellers")',
        ' or contains(text(), "Sellers")',
        ' or contains(text(), "merchants")',
        ' or contains(text(), "Merchants")',
        ' or contains(text(), "vendors")',
        ' or contains(text(), "Vendors")',
        ' or contains(text(), "managers")',
        ' or contains(text(), "Managers")',
        ' or contains(text(), "investors")',
        ' or contains(text(), "Investors")',
        ']'
    ])

    # Skip URLs with at least 6 path segments, e.g. home/dir1/dir2/dir3/dir4/dir5/dir6.
    except_keywords_regex = r'^https?://([^/]+/){5,}[^/]+/?$'
    # r'^https?://(\D*\d\D*){3,}$'

    # Domains that should never be followed.
    except_keywords = (
        'youtube|google|mozilla|facebook|twitter|instagram|linkedin|pinterest'
        # '|([^/]+/){2,}[^/]+/?|java[Ss]cript'
    )

    # Link prefixes/fragments that mark non-page links (queries, anchors, javascript:, tel:, etc.).
    except_keywords_start = r'\?|#|java[Ss]cript|\.\.|tel:|phone:|fax:'
    # Common file extensions that are not followed if they occur in links.
    IGNORED_EXTENSIONS = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'webp',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff', 'webm',
        # video
        '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'mpeg', 'qt', 'rm', 'swf', 'wmv', 'flv',
        'm4a',
        # document
        'pdf', 'xls', 'xlsx', 'doc', 'docx', 'ppt', 'pps', 'pptx', 'pptm', 'ppsx', 'ppsm', 'sldx', 'sldm',
        'xps', 'rtf', 'odt', 'ods', 'odp', 'odg', 'odf',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar', '7z', 'gz', 'bz2', 'tar'
    ]
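    # Note: scrapy.linkextractors also defines a default IGNORED_EXTENSIONS list;
    # the local variant above is passed explicitly to deny_extensions in the Rule below.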
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('', ), deny=(r'subsection\.php', ))),
        # Extract links whose URL contains one of the accepted keywords and
        # parse them with the spider's parse_item method.
        Rule(
            LinkExtractor(
                allow=(r'^https?://[^\?#=]*(' + accept_keywords + r')[^\?#=]*$'),
                # allow=(),
                deny=(r'.*(' + except_keywords + r').*', '(' + except_keywords_start + ').*'),
                # deny=(),
                allow_domains=(allowed_domains),
                deny_domains=(),
                deny_extensions=IGNORED_EXTENSIONS,
                restrict_xpaths=(),
                restrict_css=(),
                tags=('a', 'area'),
                attrs=('href', ),
                canonicalize=False,
                unique=True,
                process_value=None,
                strip=True
            ),
            callback='parse_item',
            follow=False
        ),
    )
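    # Illustrative behaviour of the Rule above (URLs are hypothetical):
    #   https://example.com/investor-relations          -> followed (contains "invest"/"relations")
    #   https://example.com/investors?page=2            -> rejected (query strings excluded by the allow pattern)
    #   https://www.facebook.com/investorrelations      -> rejected (matches except_keywords)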
    # Configure logging through Scrapy's standard LOG_FILE / LOG_LEVEL settings.
    custom_settings = {
        'LOG_FILE': 'qs.log',
        'LOG_LEVEL': 'INFO',
    }

    # To run the start URLs themselves through the extraction logic in a CrawlSpider,
    # override parse_start_url() and delegate to parse_item():
    # def parse_start_url(self, response):
    #     self.logger.info('>>>>>>>> Parse start url: %s', response)
    #     return self.parse_item(response)
    def parse_item(self, response):
        # self.logger.info('allowed_domains: %s', self.allowed_domains)
        self.logger.info('>>>>>>>> Response page url: %s', response.url)
        if re.match(self.except_keywords_regex, response.url):
            self.logger.info('>>>>>>>> Ignoring response page url: %s', response.url)
            return

        sub_domain = tldextract.extract(response.url).subdomain
        domain = tldextract.extract(response.url).domain
        suffix = tldextract.extract(response.url).suffix
        page_root_domain = sub_domain + '.' + domain + '.' + suffix

        # Match plain addresses as well as obfuscated forms such as "name (at) host dot com".
        email_regex = r'^(mailto:)?[a-zA-Z0-9_.+-]+(@|[\[\(]?at[\]\)]?|[\[\(]?AT[\]\)]?)[a-zA-Z0-9-]+(\.|dot|DOT)[a-zA-Z0-9-.]+$'
        all_email_text = response.xpath('//body//*/text()').re(email_regex)
        email_count_in_text = len(all_email_text)
        all_email_link = response.xpath('//a/@href').re(email_regex)
        email_count_in_href = len(all_email_link)
        # yield {
        #     'email_count_in_text': email_count_in_text,
        #     'email_count_in_href': email_count_in_href
        # }
        # if email_count_in_text < 1 and email_count_in_href < 1:
        #     self.logger.info('>>>>>>>> Ignoring response page url that contains no email: %s', response.url)
        #     return
        # else:
        if email_count_in_text > 0 or email_count_in_href > 0:
            # Find the site logo/brand link in the page.
            logo_type = ''
            logo_text = 'none'
            logo_element = response.xpath('//a')
            if logo_element.xpath('./text()').extract_first() is not None:
                logo_link = logo_element.xpath('./@href').extract_first(default='')
                root_domain = re.sub(r'(http)s?://', '', logo_link)
                # is_logo = not bool(re.search(r'(' + self.except_keywords_start + ').*', logo_link))
                is_logo = False
                if page_root_domain == root_domain or logo_link == '/' or logo_link.endswith('home'):
                    is_logo = True
                if is_logo:
                    if len(logo_element.xpath('./text()').extract_first(default='')) > 0:
                        logo_text = logo_element.xpath('./text()').extract_first()
                        # logo_type = '_text'
                        logo_text = re.sub(r'[\r\n\s]{2,}', ' ', logo_text)
                    elif len(logo_element.xpath('./@title').extract_first(default='')) > 0:
                        logo_text = logo_element.xpath('./@title').extract_first()
                        # logo_type = '_title'
                        logo_text = re.sub(r'[\r\n\s]{2,}', ' ', logo_text)
            elif response.xpath('(//img)[1]').xpath('./text()').extract_first() is not None:
                # Fall back to the first image's title/alt text.
                logo_element = response.xpath('(//img)[1]')
                if len(logo_element.xpath('./@title').extract_first(default='')) > 0:
                    logo_text = logo_element.xpath('./@title').extract_first()
                    # logo_type = '_img_title'
                    logo_text = re.sub(r'[\r\n\s]{2,}', ' ', logo_text)
                elif len(logo_element.xpath('./@alt').extract_first(default='')) > 0:
                    logo_text = logo_element.xpath('./@alt').extract_first()
                    # logo_type = '_img_alt'
                    logo_text = re.sub(r'[\r\n\s]{2,}', ' ', logo_text)
            # Find header text in the page.
            header_text = ''
            header = response.xpath('(//header)[1]/*').extract_first()
            if header is not None and re.match(r'^[a-zA-Z]+$', header):
                header = re.sub(r'[\r\n\t]+', '', header)
                header = re.sub(r'\s+', ' ', header)
                header_text = header + ';'
            header_text_list = []
            header_element = response.xpath('//div[contains(@*, "Header") or contains(@*, "header")]')
            for header in header_element:
                header_sub_text = header.xpath('./text()').extract_first()
                if header_sub_text is not None and re.match(r'^[a-zA-Z]+$', header_sub_text):
                    header_sub_text = re.sub(r'[\r\n\t]+', '', header_sub_text)
                    header_sub_text = re.sub(r'\s+', ' ', header_sub_text)
                    header_text_list.append(header_sub_text + ';')
            header_text += ''.join(header_text_list)

            # Find footer text in the page.
            footer_text = ''
            footer = response.xpath('(//footer)[1]/*').extract_first()
            if footer is not None and re.match(r'^[a-zA-Z]+$', footer):
                footer = re.sub(r'[\r\n\t]+', '', footer)
                footer = re.sub(r'\s+', ' ', footer)
                footer_text = footer + ';'
            footer_text_list = []
            footer_element = response.xpath('//div[contains(@*, "Footer") or contains(@*, "footer")]')
            for footer in footer_element:
                footer_sub_text = footer.xpath('./text()').extract_first()
                if footer_sub_text is not None and re.match(r'^[a-zA-Z]+$', footer_sub_text):
                    footer_sub_text = re.sub(r'\r\n', '', footer_sub_text)
                    footer_sub_text = re.sub(r'\s+', ' ', footer_sub_text)
                    footer_text_list.append(footer_sub_text + ';')
            footer_text += ''.join(footer_text_list)
            # yield {
            #     'footer_text': footer_text
            # }
            # Log response url metadata.
            yield {
                'page_url': response.url,
                'depth': response.meta['depth'],
                'sub_domain': sub_domain,
                'email_count_in_text': email_count_in_text,
                'email_count_in_href': email_count_in_href,
                'logo': logo_text,
                'header': header_text,
                'footer': footer_text
            }
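            # Illustrative shape of the metadata record above (values hypothetical):
            # {'page_url': 'https://example.com/investor-relations', 'depth': 1,
            #  'sub_domain': 'www', 'email_count_in_text': 0, 'email_count_in_href': 2,
            #  'logo': 'Example Corp', 'header': 'About;', 'footer': 'Contact;'}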
        if email_count_in_href > 0:
            all_email_element = response.xpath('//a[starts-with(@href, "mailto:")]')
            # all_email_element = Selector(response=response).xpath('//a[re:test(@href, "' + email_regex + '")]')
            # XPath test regex : (//a[starts-with(@href, "mailto:")])[1]/../..//a[not(starts-with(@href, "mailto:"))]
            # XPath test regex : (//a[starts-with(@href, "mailto:")])[1]/ancestor-or-self::div[count(descendant::a) = 3]
            # XPath test regex : (//a[starts-with(@href, "mailto:")])[1]/ancestor-or-self::div[count(descendant::a) > 1 and count(descendant::a) < 5]
            # XPath test regex : (//a[starts-with(@href, "mailto:")])[1]/ancestor-or-self::*/h3
            for email in all_email_element:
                link_info = link_info2 = link_info3 = link_info4 = ''
                link_text = email.xpath('./text()').extract_first()
                link_href = email.xpath('./@href').extract_first()
                is_plain_email = bool(re.search(email_regex, link_href, flags=re.IGNORECASE))
                if is_plain_email:
                    if link_text is not None:
                        if re.search(email_regex, link_text, flags=re.IGNORECASE):
                            # link_element = email.xpath('../../*[contains(@*, "title")]')
                            # info_elements = email.xpath('../..')
                            parent_element = email.xpath('name(..)').extract_first()
                            if parent_element == 'td':
                                # Contact table: capture the column headers and the row
                                # that holds this address.
                                table_element = email.xpath('ancestor-or-self::table')
                                table_head_element = table_element.xpath('./thead/tr/th/text()')
                                table_head_text = ''
                                for head in table_head_element.extract():
                                    table_head_text = table_head_text + head + ';'
                                table_data_element = email.xpath('ancestor-or-self::tr/td/text()')
                                table_data_text = ''
                                for data in table_data_element.extract():
                                    table_data_text = table_data_text + data + ';'
                                yield {
                                    'table_head_text': table_head_text,
                                    'table_data_text': table_data_text,
                                    'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE)
                                }
                            else:
                                # Otherwise collect nearby links and h1/h2/h3 headings
                                # as context for the address.
                                info_element = email.xpath('../..//a[not(starts-with(@href, "mailto:"))]')
                                for info in info_element:
                                    info_text = info.xpath('./text()').extract_first()
                                    if info_text is not None and len(info_text) > 1:
                                        link_info = info_text
                                        break
                                info_element = email.xpath('ancestor-or-self::*/h3')
                                for info in info_element:
                                    info_text = info.xpath('./text()').extract_first()
                                    if info_text is not None and len(info_text) > 1:
                                        link_info2 += info_text + ';'
                                info_element = email.xpath('ancestor-or-self::*//h2')
                                for info in info_element:
                                    info_text = info.xpath('./text()').extract_first()
                                    if info_text is not None and len(info_text) > 1:
                                        link_info3 += info_text + ';'
                                info_element = email.xpath('ancestor-or-self::*//h1')
                                for info in info_element:
                                    info_text = info.xpath('./text()').extract_first()
                                    if info_text is not None and len(info_text) > 1:
                                        link_info4 += info_text + ';'
                                yield {
                                    'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE),
                                    'info': link_info,
                                    'info2': link_info2,
                                    'info3': link_info3,
                                    'info4': link_info4,
                                    'text': link_text
                                }
                        else:
                            yield {
                                'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE),
                                'name': link_text,
                                'text': link_text
                            }
                    else:
                        yield {
                            'email': re.sub(r'mailto:', '', link_href, flags=re.IGNORECASE),
                            'name': link_text,
                            'text': link_text
                        }
        elif email_count_in_text > 0:
            # Plain-text addresses (no mailto: link): scan every element and
            # use nearby h2/h3/h4 headings as context.
            all_email_text_element = response.xpath('//body//*')
            for text_element in all_email_text_element:
                text = text_element.xpath('./text()').extract_first()
                if text is not None and re.match(email_regex, text, flags=re.IGNORECASE):
                    link_info = ''
                    info_element = text_element.xpath('ancestor-or-self::*[count(descendant::a) = 1]')
                    for info in info_element:
                        info_text = info.xpath('//h2/text()').extract_first()
                        if info_text is not None and len(info_text) > 1:
                            link_info += info_text + ';'
                        info_text = info.xpath('//h3/text()').extract_first()
                        if info_text is not None and len(info_text) > 1:
                            link_info += info_text + ';'
                        info_text = info.xpath('//h4/text()').extract_first()
                        if info_text is not None and len(info_text) > 1:
                            link_info += info_text + ';'
                    yield {
                        'email': text,
                        'info': link_info
                    }
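
# One way to run the spider (assumes Scrapy is installed and companies.csv is
# present in the working directory); the -o flag exports the yielded dicts:
#   scrapy runspider moneyspider.py -o emails.json
#   scrapy runspider moneyspider.py -o emails.json -s DEPTH_LIMIT=3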