# worker.rb

require 'net/http'
require 'uri'
require 'json'
require 'securerandom'
# Third-party gems (whatlanguage, engtagger, sad_panda, odyssey, curb, fletcher,
# phonelib, swot, gman, whois, metainspector, pismo, social_shares, etc.) are
# assumed to be loaded via Bundler.

class Worker
  @@wl = WhatLanguage.new(:all)        # language detection
  @@tgr = EngTagger.new                # part-of-speech tagger
  @@validator = ValidationWorker.new   # app-defined email validation worker

  def self.validate_email_list(filepath)
    domain = "hushfling.com"
    website = Website.find_or_initialize_by(:domain => domain, :user_id => 1)
    website.save
    counter = 0
    file = File.open(filepath, "rb")
    file.each do |line|
      address = line.chomp
      email = EmailLead.find_or_create_by(:address => address, :domain => domain, :user_id => 1)
      if email
        counter += 1
        puts email.id
        puts email.address
      end
    end
    puts counter
    file.close
  end
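
  # Small JSON-over-HTTP POST helper used by the service calls below.
  # Illustrative usage (the arguments here are examples, not taken from this codebase):
  #   Worker.post("http://#{ENV['API_HOST']}:8080", "/predict", "service" => "imageserv")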
  def self.post(url, path, body = {})
    uri = URI.parse(url)
    http = Net::HTTP.new(uri.host, uri.port)
    request = Net::HTTP::Post.new(path)
    request.add_field('Content-Type', 'application/json')
    request.body = body.to_json
    response = http.request(request)
    response.body
  end
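
  # Registers the "imageserv" image-classification service on the ML API listening on
  # port 8081. The request shape (caffe mllib, image connector, model repository) appears
  # to follow the DeepDetect service-creation API; /opt/models/ggnet/ is whatever model
  # repository that server has mounted.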
  def self.new_image_service
    system("curl -X PUT 'http://#{ENV['API_HOST']}:8081/services/imageserv' -d '{\"mllib\":\"caffe\",\"description\":\"image classification service\",\"type\":\"supervised\",\"parameters\":{\"input\":{\"connector\":\"image\"},\"mllib\":{\"nclasses\":1000}},\"model\":{\"repository\":\"/opt/models/ggnet/\"}}'")
  end
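
  # Asks the "imageserv" service to classify the image at `url` and returns the raw JSON
  # response body (top 3 classes, per the "best" => 3 output parameter).
  # Illustrative usage (example URL): Worker.predict_image("http://example.com/cat.jpg")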
  def self.predict_image(url)
    body = { "service" => "imageserv",
             "parameters" => { "input" => { "width" => 224, "height" => 224 },
                               "output" => { "best" => 3 } },
             "data" => [url.to_s] }
    post("http://#{ENV['API_HOST']}:8080", "/predict", body)
  end

  def self.predict_location(ip)
    # Assumes the /website/locate endpoint accepts the address to look up as its :url
    # parameter; `ip` is passed straight through.
    result = Curl.get("http://#{ENV['API_HOST']}/website/locate", :url => ip)
    if result
      response = result.body_str
      puts response
      response
    else
      false
    end
  end
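
  # Kicks off an asynchronous training run on the "imageserv" service. `tags` is passed
  # straight through as the "data" array, so it is presumably a list of training-data
  # locations the ML server can read; `url` is currently unused (see note below).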
  def self.train_image(url, tags = [])
    # NOTE: `url` is accepted but not used; the training data comes from `tags`.
    body = { "service" => "imageserv", "async" => true,
             "parameters" => {
               "mllib" => { "gpu" => false,
                            "net" => { "batch_size" => 32 },
                            "solver" => { "test_interval" => 500, "iterations" => 30000,
                                          "base_lr" => 0.001, "stepsize" => 1000, "gamma" => 0.9 } },
               "input" => { "connector" => "image", "test_split" => 0.1, "shuffle" => true,
                            "width" => 224, "height" => 224 },
               "output" => { "measure" => ["acc", "mcll", "f1"] } },
             "data" => tags }
    post("http://#{ENV['API_HOST']}:8080", "/train", body)
  end
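
  # Sends the image URL to an OCR microservice assumed to be listening on port 9292 and
  # expecting JSON of the form { img_url: ..., worker: "tesseract" }.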
  def self.ocr_image(url)
    post("http://#{ENV['API_HOST']}:9292", "/ocr", :img_url => url, :worker => "tesseract")
  end
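
  # Runs a battery of text analyses and returns them in one hash: word list and parts of
  # speech (EngTagger), language detection (WhatLanguage), sentiment polarity and emotion
  # (SadPanda), and Coleman-Liau reading level (Odyssey). Returns false for nil text.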
  def self.analyze_text(text)
    if text
      hash = {}
      tagged = @@tgr.add_tags(text)
      hash[:word_list] = @@tgr.get_words(text)
      hash[:nouns] = @@tgr.get_nouns(tagged)
      hash[:proper_nouns] = @@tgr.get_proper_nouns(tagged)
      hash[:past_tense_verbs] = @@tgr.get_past_tense_verbs(tagged)
      hash[:adjectives] = @@tgr.get_adjectives(tagged)
      hash[:noun_phrases] = @@tgr.get_noun_phrases(tagged)
      hash[:language] = @@wl.language(text)
      hash[:languages_ranked] = @@wl.process_text(text)
      hash[:profanity] = SadPanda.polarity(text)
      hash[:emotion] = SadPanda.emotion(text)
      hash[:reading_level] = Odyssey.coleman_liau(text)
      return hash
    else
      return false
    end
  end
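
  # Enqueues a SpiderWorker crawl job (Sidekiq-style perform_async) and returns the queue
  # id, or false if enqueueing failed.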
  def self.crawl(url, user_id)
    job_id = SecureRandom.hex(8)
    qid = SpiderWorker.perform_async(url, user_id, job_id)
    if qid
      return qid
    else
      return false
    end
  end

  def self.get_page_rank(url)
    googlerank = GooglePageRank.get(url)
    if googlerank
      return googlerank
    else
      return false
    end
  end
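
  # Scrapes basic product data (name, description, price) from a product page URL using
  # the Fletcher gem.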
  def self.extract_product(url)
    if url
      hash = {}
      product = Fletcher.fetch url
      hash[:product_name] = product.name # => "Avenir Deluxe Unicycle (20-Inch Wheel)"
      hash[:description] = product.description
      # hash[:image] = product.image.src || nil
      hash[:price] = product.price
      hash
    else
      return false
    end
  end

  def self.extract_entities(text)
    # NOTE: @@ner is referenced here but never initialized in this file; a named-entity
    # recognizer is expected to be assigned to it elsewhere.
    if text
      entities = @@ner.perform(text)
      if entities
        return entities
      else
        return false
      end
    end
  end
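
  # Checks deliverability of an address with the email_verifier gem, which talks to the
  # domain's mail server; returns the verifier's response or false.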
  def self.check_email(email_address)
    if email_address
      resp = EmailVerifier.check(email_address)
      if resp
        return resp
      else
        return false
      end
    end
  end
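
  # Fires a /predict call whose output section carries a Mustache template plus a network
  # connector, so classification results are rendered and pushed to the configured endpoint
  # (the placeholder Elasticsearch-style URL in the template) rather than only returned.
  # This mirrors the DeepDetect output-connector pattern; despite the name, no service is
  # created here.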
  def self.create_services
    json = '{
      "service":"imageserv",
      "parameters":{
        "mllib":{
          "gpu":true
        },
        "input":{
          "width":224,
          "height":224
        },
        "output":{
          "best":3,
          "template":"{ {{#body}}{{#predictions}} \"uri\":\"{{uri}}\",\"categories\": [ {{#classes}} { \"category\":\"{{cat}}\",\"score\":{{prob}} } {{^last}},{{/last}}{{/classes}} ] {{/predictions}}{{/body}} }",
          "network":{
            "url":"your-elasticsearch-server.com/images/img",
            "http_method":"POST"
          }
        }
      },
      "data":["http://i.ytimg.com/vi/0vxOhd4qlnA/maxresdefault.jpg"]
    }'
    # The payload must be single-quoted for the shell; otherwise the embedded spaces and
    # quotes break the curl command.
    result = system("curl -XPOST 'http://localhost:8080/predict' -d '#{json}'")
  end

  def self.validate_email(email_address, user_id)
    resp = @@validator.perform(email_address, user_id)
    if resp
      return resp
    else
      return false
    end
  end
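
  # Profiles an email address: extracts the domain, looks up an academic institution via
  # the Swot gem, and checks for a government domain via the Gman gem.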
  def self.analyze_email(email_address)
    if email_address
      hash = {}
      email_domain = email_address.to_s.split("@").last
      school = Swot::school_name email_address
      govt_domain = Gman.new email_address
      hash[:domain] = email_domain
      if school
        hash[:academia] ||= school
      end
      if govt_domain
        hash[:govt_agency] = govt_domain.agency
        # hash[:domain] ||= govt_domain.domain
        hash[:is_govt] = govt_domain.federal?
        hash[:academia] ||= false
      end
      return hash
    else
      return false
    end
  end
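
  # Profiles a phone number: normalizes it with phony_rails, maps the (assumed US) area
  # code to a city via to_region, and pulls type/country/location from Phonelib.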
  def self.analyze_phone(phone_number)
    if phone_number
      hash = {}
      identifier = Phonelib.parse(phone_number)
      hash[:number] = phone_number.phony_formatted(:normalize => :US, :spaces => '-')
      # Drop a leading "1" country code, if present, before taking the three-digit area code.
      if phone_number[0].to_s == "1"
        area = phone_number.to_s[1..3]
      else
        area = phone_number.to_s[0..2]
      end
      hash[:region] = Integer(area).to_region(:city => true)
      hash[:type] = identifier.human_type
      hash[:country] = identifier.country
      hash[:location] = identifier.geo_name
      return hash
    else
      return false
    end
  end

  def self.analyze_name(first_name, last_name)
    if first_name and last_name
      hash = {}
      hash[:gender] = Guess.gender(first_name.to_s.humanize)
      # $races is assumed to be a surname => ethnicity lookup table populated elsewhere.
      hash[:ethnicity] = $races[last_name.to_s.upcase]
      hash[:name] = [first_name, last_name].join(" ")
      return hash
    else
      return false
    end
  end
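
  # Builds a profile of a domain: page title/author/keywords/description via Pismo, the
  # WHOIS record, Google PageRank, and page metadata via MetaInspector.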
  def self.analyze_domain(domain_name)
    if domain_name
      url = "http://#{domain_name}"
      hash = {}
      doc = Pismo::Document.new url
      whois_data = Whois.whois(domain_name)
      googlerank = GooglePageRank.get(url)
      meta = MetaInspector.new(url)
      if doc and doc.title
        hash[:title] = doc.title
        hash[:author] = doc.author
        hash[:meta_keywords] = doc.keywords
        hash[:meta_description] = doc.description
      end
      if whois_data
        hash[:whois] = whois_data
      end
      if googlerank
        hash[:google_links] = googlerank.to_s
      end
      if meta
        hash[:meta] = meta.to_hash.to_s
      end
      return hash
    else
      return false
    end
  end
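
  # Returns share counts across social networks for the given URL via the social_shares gem.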
  def self.social_shares(social_media_url)
    if social_media_url
      result = SocialShares.all social_media_url
      return result
    else
      return false
    end
  end
end