# spider_worker.rb

require 'objspace' # only needed if the bandwidth accounting below is re-enabled

class SpiderWorker
  include Sidekiq::Worker
  include Sidekiq::Status::Worker
  include Sidekiq::Benchmark::Worker

  sidekiq_options queue: 'crawler', retry: false, backtrace: true, expires_in: 1.hour
  # Per-user throttling, currently disabled:
  # sidekiq_options throttle: { threshold: 10, period: 1.minute, key: ->(user_id) { user_id } }
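
  # Crawls +url+ on behalf of +user_id+, harvesting phone numbers, email
  # addresses, and social-profile links from every page, then hands the
  # de-duplicated results to LeadWorker. +job_id+ names the Polipus
  # crawler and is tracked on the user for the lifetime of the crawl.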
  def perform(url, user_id, job_id)
    benchmark.spider_metric do
      @max_pages = 1000
      total @max_pages # tell sidekiq-status how many steps to expect
      @profiles = []
      @addresses = []
      @numbers = []
      @original_url = url
      @pages = 0
      @current_user = User.find(user_id)
      @end_time = Time.now + 15.minutes
      if @current_user
        # if @current_user.websites.count > @current_user.max_targets
        #   return "[ERROR] Max reached for user #{user_id}"
        # else
        @current_user.active_engines.incr
        @current_user.job_ids << job_id
        # end
      else
        return "[ERROR] User #{user_id} does not exist"
      end
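      # Normalize the URL and derive the bare domain; if URI.parse finds
      # no host (e.g. a scheme-less string), fall back to slicing between
      # "://" and the first path separator.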
      url = url.gsub(' ', '') # strip stray spaces without mutating the argument
      uri = URI.parse(url)
      @original_domain = uri.host || url.rpartition('://')[2].rpartition('/')[0]
      @website = Website.find_or_initialize_by(domain: @original_domain, user_id: user_id)
      @website.url = url
      @website.save
      @logger = Logger.new(STDOUT)
      @client = Elasticsearch::Client.new(url: ENV['ELASTICSEARCH_URL'], logger: @logger)
      @client.transport.logger.level = Logger::WARN
      if @current_user.admin # nil check is unnecessary: we returned above if the user was missing
        # Admins get every crawled page persisted to Elasticsearch.
        @storage = Polipus::Storage::ElasticSearchStore.new(
          @client,
          refresh: true
        )
        @storage.include_query_string_in_uuid = true
      else
        @storage = nil # fall back to Polipus's default page store
      end
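
      # Polipus configuration: URL queue in a local Redis (db 11, hiredis
      # driver), 4 crawler workers, depth and redirect limits of 4, and
      # 10-second open/read timeouts. robots.txt is deliberately ignored.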
      @options = {
        redis_options: {
          host: 'localhost',
          driver: 'hiredis',
          db: 11
        },
        depth_limit: 4,
        discard_page_bodies: false,
        # HTTP read timeout in seconds
        read_timeout: 10,
        # HTTP open connection timeout in seconds
        open_timeout: 10,
        obey_robots_txt: false,
        logger: @logger,
        skip_query_strings: false,
        user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
        enable_signal_handler: false,
        workers: 4,
        redirect_limit: 4,
        # ttl: 900,
        storage: @storage
      }
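
      # Run the crawl under job_id. The skip rules keep binary assets and
      # noisy paths (/versions/, /images/) out of the queue.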
      Polipus.crawler(job_id, url, @options) do |crawler|
        crawler.skip_links_like(/\/versions\//)
        crawler.skip_links_like(/\/images\//)
        # One case-insensitive pattern replaces the original per-extension
        # (and inconsistently per-case) rules.
        crawler.skip_links_like(/\.(pdf|zip|jpg|png|gif|exe|mpg|avi|mp4|mpeg)$/i)
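
        # Per-page callback: bump the user's counters, report progress to
        # sidekiq-status, then scan the page text for contact details.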
        crawler.on_page_downloaded do |crawled_page|
          @current_user.mileage.increment
          @current_user.pages_crawled.add crawled_page.url
          @pages += 1
          at @pages, crawled_page.url.to_s # sidekiq-status progress update
          if crawled_page.success?
            # @current_user.bandwidth_used.incr(ObjectSpace.memsize_of(crawled_page.body))
            if crawled_page.doc && crawled_page.doc.at('html')
              body_text = crawled_page.doc.at('html').text
            else
              # Raw body bytes: tag as UTF-8 and scrub invalid sequences so
              # the regex scans below cannot raise on bad encoding.
              body_text = crawled_page.body.to_s.force_encoding('UTF-8').scrub('')
            end
            if body_text
              # Phone: US-style 10-digit numbers; \2 forces the same
              # separator on both sides of the exchange.
              body_text.scan(/\(?([0-9]{3})\)?([ .-]?)([0-9]{3})\2([0-9]{4})/).each do |phone_number|
                if phone_number
                  phone_number = phone_number.to_s.scan(/\d/).join # collapse captures to bare digits
                  @numbers << [phone_number, crawled_page.url]
                  # @current_user.notifications.add ["Phone: #{phone_number}", @original_domain]
                end
              end
              # Email: at least one word character before the @, then a
              # domain ending in a 2-6 letter TLD.
              body_text.scan(/\w[\w.-]*@[\w.-]+\.\w{2,6}/).each do |address|
                if address
                  @addresses << [address.to_s.downcase, crawled_page.url]
                  # @current_user.notifications.add ["Email: #{address.to_s.downcase}", @original_domain]
                end
              end
              # Twitter
              body_text.scan(/twitter\.com(?:\/\#!)?\/(\w+)/i).each do |twitter|
                if twitter && !twitter.join.include?('.php')
                  @profiles << ["http://twitter.com/#{twitter.join.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Twitter: #{twitter.join.downcase}", @original_domain]
                end
              end
              # Facebook
              body_text.scan(/(?:https?:\/\/)?(?:www\.)?facebook\.com\/(?:(?:\w)*#!\/)?(?:pages\/)?(?:[\w\-]*\/)*([\w\-\.]*)/).each do |facebook|
                if facebook && !facebook.join.include?('.php')
                  @profiles << ["http://facebook.com/#{facebook.join.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Facebook: #{facebook.join.downcase}", @original_domain]
                end
              end
              # LinkedIn: the lookbehind also matches inside www. URLs, so
              # one scan per path prefix suffices (the original duplicated
              # the /in/ scan and added redundant www. variants).
              [body_text.scan(/(?<=linkedin\.com\/in\/)[a-z0-9_-]{3,16}/),
               body_text.scan(/(?<=linkedin\.com\/pub\/)[a-z0-9_-]{3,16}/)].flatten.each do |linkedin|
                if linkedin
                  @profiles << ["http://linkedin.com/in/#{linkedin.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["LinkedIn: #{linkedin.downcase}", @original_domain]
                end
              end
              # Google+
              body_text.scan(/(?<=plus\.google\.com\/)[a-z0-9_-]{3,16}/).each do |googleplus|
                if googleplus
                  @profiles << ["http://plus.google.com/+#{googleplus.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Google+: #{googleplus.downcase}", @original_domain]
                end
              end
              # Instagram
              body_text.scan(/(?<=instagram\.com\/)[a-z0-9_-]{3,16}/).each do |instagram|
                if instagram
                  @profiles << ["http://instagram.com/#{instagram.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Instagram: #{instagram.downcase}", @original_domain]
                end
              end
              # Pinterest
              body_text.scan(/(?<=pinterest\.com\/)[a-z0-9_-]{3,16}/).each do |pinterest|
                if pinterest
                  @profiles << ["http://pinterest.com/#{pinterest.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Pinterest: #{pinterest.downcase}", @original_domain]
                end
              end
              # GitHub
              body_text.scan(/(?<=github\.com\/user\/)[a-z0-9_-]{3,16}/).each do |username|
                if username
                  @profiles << ["http://github.com/#{username.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["GitHub: #{username.downcase}", @original_domain]
                end
              end
              # Vimeo
              body_text.scan(/vimeo\.com(?:\/\#!)?\/(\w+)/i).each do |vimeo|
                if vimeo
                  @profiles << ["http://vimeo.com/#{vimeo.join.downcase}", crawled_page.url]
                  # @current_user.notifications.add ["Vimeo: #{vimeo.join.downcase}", @original_domain]
                end
              end
              # Additional networks, currently disabled:
              #
              # last.fm
              # [body_text.scan(/(?<=lastfm.com\/user\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.lastfm.com\/user\/)[a-z0-9_-]{3,16}/)].flatten.each do |username|
              #   if username
              #     @profiles << ["http://lastfm.com/user/#{username.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["LastFM: #{username.downcase}", @original_domain]
              #   end
              # end
              # StumbleUpon
              # body_text.scan(/stumbleupon\.com(?:\/\#!)?\/(\w+)/i).each do |stumbleupon|
              #   if stumbleupon
              #     @profiles << ["http://stumbleupon.com/#{stumbleupon.join.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["StumbleUpon: #{stumbleupon.join.downcase}", @original_domain]
              #   end
              # end
              # Flickr
              # body_text.scan(/flickr\.com(?:\/\#!)?\/(\w+)/i).each do |username|
              #   if username
              #     @profiles << ["http://flickr.com/#{username.join.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Flickr: #{username.join.downcase}", @original_domain]
              #   end
              # end
              # Foursquare
              # [body_text.scan(/(?<=foursquare.com\/user\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.foursquare.com\/user\/)[a-z0-9_-]{3,16}/)].flatten.each do |username|
              #   if username
              #     @profiles << ["http://foursquare.com/user/#{username.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Foursquare: #{username.downcase}", @original_domain]
              #   end
              # end
              # SoundCloud
              # body_text.scan(/soundcloud\.com(?:\/\#!)?\/(\w+)/i).each do |soundcloud|
              #   if soundcloud
              #     @profiles << ["http://soundcloud.com/#{soundcloud.join.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["SoundCloud: #{soundcloud.join.downcase}", @original_domain]
              #   end
              # end
              # Meetup
              # [body_text.scan(/(?<=meetup.com\/members\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.meetup.com\/members\/)[a-z0-9_-]{3,16}/)].flatten.each do |meetup|
              #   if meetup
              #     @profiles << ["http://meetup.com/members/#{meetup.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Meetup: #{meetup.downcase}", @original_domain]
              #   end
              # end
              # Reddit
              # [body_text.scan(/(?<=reddit.com\/user\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.reddit.com\/user\/)[a-z0-9_-]{3,16}/)].flatten.each do |reddit|
              #   if reddit
              #     @profiles << ["http://reddit.com/user/#{reddit.downcase}", crawled_page.url]
              #     @current_user.notifications.add ["Reddit: #{reddit.downcase}", @original_domain]
              #   end
              # end
            end
          end
        end
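
        # End-of-crawl callback: de-duplicate the findings, dispatch them
        # to LeadWorker, and release this user's crawl slot.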
        crawler.on_crawl_end do
          unless @addresses.empty? && @profiles.empty? && @numbers.empty?
            @addresses.uniq! { |a| a.first }
            @profiles.uniq! { |p| p.first }
            @numbers.uniq! { |n| n.first }
            LeadWorker.perform_async(@addresses, @numbers, @profiles, @current_user.id, @website.id, @original_domain)
          end
          @current_user.job_ids.delete job_id
          @current_user.active_engines.decr
        end
      end
    end
    benchmark.finish
  end
end
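
# Usage sketch (hypothetical values; perform_async mirrors perform's signature):
#
#   job_id = SecureRandom.hex(8)
#   SpiderWorker.perform_async('http://example.com', user.id, job_id)
#
# The caller supplies job_id rather than relying on Sidekiq's own jid
# because the same value also names the Polipus crawler and its queue.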