# okcupid_worker.rb
  1. class OkcupidWorker
  2. include Sidekiq::Worker
  3. include Sidekiq::Status::Worker
  4. include Sidekiq::Benchmark::Worker
  5. sidekiq_options :queue => 'okcupid', :retry => false, :backtrace => true
  6. def perform(usernames=[])
  7. urls = []
  8. @profiles = []
  9. @addresses = []
  10. @numbers = []
  11. @current_user = User.find(10)
  12. usernames.each do |username|
  13. urls << "https://2-instant.okcupid.com/profile/#{username}"
  14. end
  15. @logger = Logger.new(STDOUT)
  16. @storage = nil
  17. @options = {
  18. :redis_options => {
  19. :host => 'localhost',
  20. :driver => 'hiredis',
  21. :db => 11},
  22. :depth_limit => 2,
  23. :discard_page_bodies => false,
  24. # HTTP read timeout in seconds
  25. :read_timeout => 30,
  26. # HTTP open connection timeout in seconds
  27. :open_timeout => 10,
  28. :obey_robots_txt => false,
  29. :logger => @logger,
  30. :skip_query_strings => false,
  31. :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
  32. :enable_signal_handler => false,
  33. :workers => 5,
  34. :redirect_limit => 2,
  35. :storage => @storage
  36. }
  37. Polipus.crawler(job_id, urls, @options) do |crawler|
  38. crawler.skip_links_like(/\/versions\//)
  39. crawler.skip_links_like(/\.pdf$/)
  40. crawler.skip_links_like(/\.zip$/)
  41. crawler.skip_links_like(/\.jpg$/)
  42. crawler.skip_links_like(/\.png$/)
  43. crawler.skip_links_like(/\.PDF$/)
  44. crawler.skip_links_like(/\.JPG$/)
  45. crawler.skip_links_like(/\.PNG$/)
  46. crawler.skip_links_like(/\.GIF$/)
  47. crawler.skip_links_like(/\.EXE$/)
  48. crawler.skip_links_like(/\.gif$/)
  49. crawler.skip_links_like(/\.exe$/)
  50. crawler.skip_links_like(/\.mpg$/)
  51. crawler.skip_links_like(/\.avi$/)
  52. crawler.skip_links_like(/\.mp4$/)
  53. crawler.skip_links_like(/\.mpeg$/)
  54. crawler.skip_links_like(/\/images\//)
  55. crawler.on_page_downloaded do |crawled_page|
  56. if crawled_page.success? and crawled_page.body
  57. profile = SocialLead.new(:user_id => 10, :social_network => "okcupid",
  58. :username=> crawled_page.url.split("/profile/").last)
  59. profile.save
  60. body_text = crawled_page.body.force_encoding('UTF-8') || crawled_page.body
  61. if body_text
  62. # Phone Numbers
  63. body_text.scan(/\(?([0-9]{3})\)?([ .-]?)([0-9]{3})\2([0-9]{4})/).each do |phone_number|
  64. if phone_number
  65. phone_number = phone_number.to_s.scan(/\d/).join
  66. @numbers << [phone_number, crawled_page.url]
  67. end
  68. end
  69. # Email Addresses
  70. body_text.scan(/[\w\d]+[\w\d.-]@[\w\d.-]+\.\w{2,6}/).each do |address|
  71. if address
  72. @addresses << [address.to_s.downcase, crawled_page.url]
  73. end
  74. end
  75. # LinkedIn Profiles
  76. [body_text.scan(/(?<=linkedin.com\/in\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.linkedin.com\/in\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=www.linkedin.com\/pub\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=linkedin.com\/pub\/)[a-z0-9_-]{3,16}/), body_text.scan(/(?<=linkedin.com\/in\/)[a-z0-9_-]{3,16}/)].flatten.each do |linkedin|
  77. if linkedin
  78. @profiles << ["http://linkedin.com/in/#{linkedin.downcase}", crawled_page.url]
  79. end
  80. end
  81. # Google+ Profiles
  82. body_text.scan(/(?<=plus.google.com\/)[a-z0-9_-]{3,16}/).each do |googleplus|
  83. if googleplus
  84. @profiles << ["http://plus.google.com/+#{googleplus.downcase}", crawled_page.url]
  85. end
  86. end
  87. # Instagram Profiles
  88. body_text.scan(/(?<=instagram.com\/)[a-z0-9_-]{3,16}/).each do |instagram|
  89. if instagram
  90. @profiles << ["http://instagram.com/#{instagram.downcase}", crawled_page.url]
  91. end
  92. end
  93. # Pinterest Profiles
  94. body_text.scan(/(?<=pinterest.com\/)[a-z0-9_-]{3,16}/).each do |pinterest|
  95. if pinterest
  96. @profiles << ["http://pinterest.com/#{pinterest.downcase}", crawled_page.url]
  97. end
  98. end
  99. # Github Profiles
  100. body_text.scan(/github\.com(?:\/\#!)?\/(\w+)/i).each do |github|
  101. if github
  102. @profiles << ["http://github.com/#{github.join.downcase}", crawled_page.url]
  103. end
  104. end
  105. # Twitter Profiles
  106. body_text.scan(/twitter\.com(?:\/\#!)?\/(\w+)/i).each do |twitter|
  107. if twitter and !twitter.join.match(".php")
  108. @profiles << ["http://twitter.com/#{twitter.join.downcase}", crawled_page.url]
  109. end
  110. end
  111. # Facebook Profiles
  112. body_text.scan(/(?:https?:\/\/)?(?:www\.)?facebook\.com\/(?:(?:\w)*#!\/)?(?:pages\/)?(?:[\w\-]*\/)*([\w\-\.]*)/).each do |facebook|
  113. if facebook and !facebook.to_s.match(".php")
  114. @profiles << ["http://facebook.com/#{facebook.join.downcase}", crawled_page.url]
  115. end
  116. end
  117. end
  118. end
  119. end
  120. crawler.on_crawl_end do
  121. unless @addresses.empty? and @profiles.empty? and @numbers.empty?
  122. @addresses.uniq! { |a| a.first }
  123. @profiles.uniq! { |p| p.first }
  124. @numbers.uniq! { |n| n.first }
  125. LeadWorker.perform_async(@addresses, @numbers, @profiles, @current_user.id, 1, "okcupid.com")
  126. end
  127. end
  128. end
  129. end
  130. end