# spider_worker.rb

require 'objspace' # only needed if the bandwidth accounting below is re-enabled

class SpiderWorker
  include Sidekiq::Worker
  include Sidekiq::Status::Worker
  include Sidekiq::Benchmark::Worker

  sidekiq_options queue: 'crawler', retry: false, backtrace: true, expires_in: 1.hour
  # Per-user throttling, currently disabled:
  # sidekiq_options throttle: { threshold: 10, period: 1.minute, key: ->(user_id) { user_id } }
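
  # Crawls +url+ on behalf of +user_id+, harvesting phone numbers, email
  # addresses, and social-profile links from every page, then hands the
  # de-duplicated results to LeadWorker. +job_id+ names the Polipus
  # crawler and is tracked on the user for the lifetime of the crawl.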
  def perform(url, user_id, job_id)
    benchmark.spider_metric do
      @max_pages = 1000
      total @max_pages # tell sidekiq-status how many steps to expect
      @profiles = []
      @addresses = []
      @numbers = []
      @original_url = url
      @pages = 0
      @current_user = User.find(user_id)
      @end_time = Time.now + 15.minutes
      if @current_user
        # if @current_user.websites.count > @current_user.max_targets
        #   return "[ERROR] Max reached for user #{user_id}"
        # else
        @current_user.active_engines.incr
        @current_user.job_ids << job_id
        # end
      else
        return "[ERROR] User #{user_id} does not exist"
      end
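      # Normalize the URL and derive the bare domain; if URI.parse finds
      # no host (e.g. a scheme-less string), fall back to slicing between
      # "://" and the first path separator.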
      url = url.gsub(' ', '') # strip stray spaces without mutating the argument
      uri = URI.parse(url)
      @original_domain = uri.host || url.rpartition('://')[2].rpartition('/')[0]
      @website = Website.find_or_initialize_by(domain: @original_domain, user_id: user_id)
      @website.url = url
      @website.save
      @logger = Logger.new(STDOUT)
      @client = Elasticsearch::Client.new(url: ENV['ELASTICSEARCH_URL'], logger: @logger)
      @client.transport.logger.level = Logger::WARN
      if @current_user.admin # nil check is unnecessary: we returned above if the user was missing
        # Admins get every crawled page persisted to Elasticsearch.
        @storage = Polipus::Storage::ElasticSearchStore.new(
          @client,
          refresh: true
        )
        @storage.include_query_string_in_uuid = true
      else
        @storage = nil # fall back to Polipus's default page store
      end
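
      # Polipus configuration: URL queue in a local Redis (db 11, hiredis
      # driver), 4 crawler workers, depth and redirect limits of 4, and
      # 10-second open/read timeouts. robots.txt is deliberately ignored.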
      @options = {
        redis_options: {
          host: 'localhost',
          driver: 'hiredis',
          db: 11
        },
        depth_limit: 4,
        discard_page_bodies: false,
        # HTTP read timeout in seconds
        read_timeout: 10,
        # HTTP open connection timeout in seconds
        open_timeout: 10,
        obey_robots_txt: false,
        logger: @logger,
        skip_query_strings: false,
        user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
        enable_signal_handler: false,
        workers: 4,
        redirect_limit: 4,
        # ttl: 900,
        storage: @storage
      }
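
      # Run the crawl under job_id. The skip rules keep binary assets and
      # noisy paths (/versions/, /images/) out of the queue.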
      Polipus.crawler(job_id, url, @options) do |crawler|
        crawler.skip_links_like(/\/versions\//)
        crawler.skip_links_like(/\/images\//)
        # One case-insensitive pattern replaces the original per-extension
        # (and inconsistently per-case) rules.
        crawler.skip_links_like(/\.(pdf|zip|jpg|png|gif|exe|mpg|avi|mp4|mpeg)$/i)
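
        # Per-page callback: bump the user's counters, report progress to
        # sidekiq-status, then scan the page text for contact details.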
        crawler.on_page_downloaded do |crawled_page|
          @current_user.mileage.increment
          @current_user.pages_crawled.add crawled_page.url
          @pages += 1
          at @pages, crawled_page.url.to_s # sidekiq-status progress update
          if crawled_page.success?
            # @current_user.bandwidth_used.incr(ObjectSpace.memsize_of(crawled_page.body))
            if crawled_page.doc && crawled_page.doc.at('html')
              body_text = crawled_page.doc.at('html').text
            else
              # Raw body bytes: tag as UTF-8 and scrub invalid sequences so
              # the regex scans below cannot raise on bad encoding.
              body_text = crawled_page.body.to_s.force_encoding('UTF-8').scrub('')
            end
            if body_text
              # Phone: US-style 10-digit numbers; \2 forces the same
              # separator on both sides of the exchange.
              body_text.scan(/\(?([0-9]{3})\)?([ .-]?)([0-9]{3})\2([0-9]{4})/).each do |phone_number|
                if phone_number
                  phone_number = phone_number.to_s.scan(/\d/).join # collapse captures to bare digits
                  @numbers << [phone_number, crawled_page.url]
                  # @current_user.notifications.add ["Phone: #{phone_number}", @original_domain]
                end
              end
              # Email: at least one word character before the @, then a
              # domain ending in a 2-6 letter TLD.
              body_text.scan(/\w[\w.-]*@[\w.-]+\.\w{2,6}/).each do |address|
                if address
                  @addresses << [address.to_s.downcase, crawled_page.url]
                  # @current_user.notifications.add ["Email: #{address.to_s.downcase}", @original_domain]
                end
              end
              # Twitter
              body_text.scan(/twitter\.com(?:\/\#!)?\/(\w+)/i).each do |twitter|
                if twitter && !twitter.join.include?('.php')
                  @profiles << ["http://twitter.com/#{twitter.join.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Twitter: #{twitter.join.downcase}", @original_domain]
                end
              end
              # Facebook
              body_text.scan(/(?:https?:\/\/)?(?:www\.)?facebook\.com\/(?:(?:\w)*#!\/)?(?:pages\/)?(?:[\w\-]*\/)*([\w\-\.]*)/).each do |facebook|
                if facebook && !facebook.join.include?('.php')
                  @profiles << ["http://facebook.com/#{facebook.join.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Facebook: #{facebook.join.downcase}", @original_domain]
                end
              end
              # LinkedIn: the lookbehind also matches inside www. URLs, so
              # one scan per path prefix suffices (the original duplicated
              # the /in/ scan and added redundant www. variants).
              [body_text.scan(/(?<=linkedin\.com\/in\/)[a-z0-9_-]{3,16}/),
               body_text.scan(/(?<=linkedin\.com\/pub\/)[a-z0-9_-]{3,16}/)].flatten.each do |linkedin|
                if linkedin
                  @profiles << ["http://linkedin.com/in/#{linkedin.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["LinkedIn: #{linkedin.downcase}", @original_domain]
                end
              end
              # Google+
              body_text.scan(/(?<=plus\.google\.com\/)[a-z0-9_-]{3,16}/).each do |googleplus|
                if googleplus
                  @profiles << ["http://plus.google.com/+#{googleplus.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Google+: #{googleplus.downcase}", @original_domain]
                end
              end
              # Instagram
              body_text.scan(/(?<=instagram\.com\/)[a-z0-9_-]{3,16}/).each do |instagram|
                if instagram
                  @profiles << ["http://instagram.com/#{instagram.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Instagram: #{instagram.downcase}", @original_domain]
                end
              end
              # Pinterest
              body_text.scan(/(?<=pinterest\.com\/)[a-z0-9_-]{3,16}/).each do |pinterest|
                if pinterest
                  @profiles << ["http://pinterest.com/#{pinterest.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Pinterest: #{pinterest.downcase}", @original_domain]
                end
              end
              # GitHub
              body_text.scan(/(?<=github\.com\/user\/)[a-z0-9_-]{3,16}/).each do |username|
                if username
                  @profiles << ["http://github.com/#{username.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["GitHub: #{username.downcase}", @original_domain]
                end
              end
              # Vimeo
              body_text.scan(/vimeo\.com(?:\/\#!)?\/(\w+)/i).each do |vimeo|
                if vimeo
                  @profiles << ["http://vimeo.com/#{vimeo.join.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Vimeo: #{vimeo.join.downcase}", @original_domain]
                end
              end
              # Additional networks, currently disabled:
              #
              # last.fm
              # [body_text.scan(/(?<=lastfm.com\/user\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.lastfm.com\/user\/)[a-z0-9_-]{3,16}/)].flatten.each do |username|
              #   if username
              #     @profiles << ["http://lastfm.com/user/#{username.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["LastFM: #{username.downcase}", @original_domain]
              #   end
              # end
              # StumbleUpon
              # body_text.scan(/stumbleupon\.com(?:\/\#!)?\/(\w+)/i).each do |stumbleupon|
              #   if stumbleupon
              #     @profiles << ["http://stumbleupon.com/#{stumbleupon.join.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["StumbleUpon: #{stumbleupon.join.downcase}", @original_domain]
              #   end
              # end
              # Flickr
              # body_text.scan(/flickr\.com(?:\/\#!)?\/(\w+)/i).each do |username|
              #   if username
              #     @profiles << ["http://flickr.com/#{username.join.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Flickr: #{username.join.downcase}", @original_domain]
              #   end
              # end
              # Foursquare
              # [body_text.scan(/(?<=foursquare.com\/user\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.foursquare.com\/user\/)[a-z0-9_-]{3,16}/)].flatten.each do |username|
              #   if username
              #     @profiles << ["http://foursquare.com/user/#{username.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Foursquare: #{username.downcase}", @original_domain]
              #   end
              # end
              # SoundCloud
              # body_text.scan(/soundcloud\.com(?:\/\#!)?\/(\w+)/i).each do |soundcloud|
              #   if soundcloud
              #     @profiles << ["http://soundcloud.com/#{soundcloud.join.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["SoundCloud: #{soundcloud.join.downcase}", @original_domain]
              #   end
              # end
              # Meetup
              # [body_text.scan(/(?<=meetup.com\/members\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.meetup.com\/members\/)[a-z0-9_-]{3,16}/)].flatten.each do |meetup|
              #   if meetup
              #     @profiles << ["http://meetup.com/members/#{meetup.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Meetup: #{meetup.downcase}", @original_domain]
              #   end
              # end
              # Reddit
              # [body_text.scan(/(?<=reddit.com\/user\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.reddit.com\/user\/)[a-z0-9_-]{3,16}/)].flatten.each do |reddit|
              #   if reddit
              #     @profiles << ["http://reddit.com/user/#{reddit.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Reddit: #{reddit.downcase}", @original_domain]
              #   end
              # end
            end
          end
        end
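
        # End-of-crawl callback: de-duplicate the findings, dispatch them
        # to LeadWorker, and release this user's crawl slot.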
        crawler.on_crawl_end do
          unless @addresses.empty? && @profiles.empty? && @numbers.empty?
            @addresses.uniq! { |a| a.first }
            @profiles.uniq! { |p| p.first }
            @numbers.uniq! { |n| n.first }
            LeadWorker.perform_async(@addresses, @numbers, @profiles, @current_user.id, @website.id, @original_domain)
          end
          @current_user.job_ids.delete job_id
          @current_user.active_engines.decr
        end
      end
    end
    benchmark.finish
  end
end
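
# Usage sketch (hypothetical values; perform_async mirrors perform's signature):
#
#   job_id = SecureRandom.hex(8)
#   SpiderWorker.perform_async('http://example.com', user.id, job_id)
#
# The caller supplies job_id rather than relying on Sidekiq's own jid
# because the same value also names the Polipus crawler and its queue.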