# NOTE(review): removed a pasted extraction artifact (a run of concatenated line numbers).
- class Worker
- @@wl = WhatLanguage.new(:all)
- @@tgr = EngTagger.new
- @@validator = ValidationWorker.new
- def self.validate_email_list(filepath)
- domain="hushfling.com"
- website = Website.find_or_initialize_by(:domain => domain, :user_id => 1)
- website.save
- counter = 0
- file = File.open("/tmp/target_email_list.txt", "rb")
- file.each do |line|
- line.delete!("\n")
- email = EmailLead.find_or_create_by(:address => line, :domain => "hushfling.com", :user_id => 1)
- if email
- puts email.id
- puts email.address
- end
- end
- puts counter
- file.close
- end
- def self.post(url, path, body={})
- uri = URI.parse(url)
- http = Net::HTTP.new(uri.host, uri.port)
- request = Net::HTTP::Post.new(path)
- request.add_field('Content-Type', 'application/json')
- request.body = body.to_json
- response = http.request(request)
- response.body
- end
- def self.new_image_service
- system("curl -X PUT 'http://#{ENV['API_HOST']}:8081/services/imageserv' -d '{\"mllib\":\"caffe\",\"description\":\"image classification service\",\"type\":\"supervised\",\"parameters\":{\"input\":{\"connector\":\"image\"},\"mllib\":{\"nclasses\":1000}},\"model\":{\"repository\":\"/opt/models/ggnet/\"}}'")
- end
- def self.predict_image(url)
- body = {"service"=>"imageserv", "parameters"=>{"input"=>{"width"=>224, "height"=>224}, "output"=>{"best"=>3}}, "data"=>["#{url}"]}
- uri = URI.parse("http://#{ENV['API_HOST']}:8080")
- http = Net::HTTP.new(uri.host, uri.port)
- request = Net::HTTP::Post.new("/predict")
- request.add_field('Content-Type', 'application/json')
- request.body = body.to_json
- response = http.request(request)
- response.body
- end
- def self.predict_location(ip)
- body = {"service"=>"imageserv", "parameters"=>{"input"=>{"width"=>224, "height"=>224}, "output"=>{"best"=>3}}, "data"=>["#{url}"]}
- uri = URI.parse("http://#{ENV['API_HOST']}:8080")
- http = Net::HTTP.new(uri.host, uri.port)
- request = Net::HTTP::Post.new("/predict")
- request.add_field('Content-Type', 'application/json')
- request.body = body.to_json
- response = http.request(request)
- response.body
- result = Curl.get("http://#{ENV['API_HOST']}/website/locate", {:url => "http://google.com"})
- if result
- response = result.body_str
- end
- expect(response).to be_truthy
- expect(response).to have_content("CA")
- puts response
- end
- def self.train_image(url, tags=[])
- body = {"service"=>"imageserv", "async"=>true, "parameters"=>{"mllib"=>{"gpu"=>false, "net"=>{"batch_size"=>32}, "solver"=>{"test_interval"=>500, "iterations"=>30000, "base_lr"=>0.001, "stepsize"=>1000, "gamma"=>0.9}}, "input"=>{"connector"=>"image", "test_split"=>0.1, "shuffle"=>true, "width"=>224, "height"=>224}, "output"=>{"measure"=>["acc", "mcll", "f1"]}}, "data"=>tags}
- uri = URI.parse("http://#{ENV['API_HOST']}:8080")
- http = Net::HTTP.new(uri.host, uri.port)
- request = Net::HTTP::Post.new("/train")
- request.add_field('Content-Type', 'application/json')
- request.body = body.to_json
- response = http.request(request)
- response.body
- end
- def self.ocr_image(url)
- uri = URI.parse("http://#{ENV['API_HOST']}:9292")
- http = Net::HTTP.new(uri.host, uri.port)
- request = Net::HTTP::Post.new("/ocr")
- request.add_field('Content-Type', 'application/json')
- request.body = {:img_url => url, :worker => "tesseract"}.to_json
- response = http.request(request)
- response.body
- end
- def self.analyze_text(text)
- if text
- hash = {}
- tagged = @@tgr.add_tags(text)
- hash[:word_list] = @@tgr.get_words(text)
- hash[:nouns] = @@tgr.get_nouns(tagged)
- hash[:proper_nouns] = @@tgr.get_proper_nouns(tagged)
- hash[:past_tense_verbs] = @@tgr.get_past_tense_verbs(tagged)
- hash[:adjectives] = @@tgr.get_adjectives(tagged)
- hash[:noun_phrases] = @@tgr.get_noun_phrases(tagged)
- hash[:language] = @@wl.language(text)
- hash[:languages_ranked] = @@wl.process_text(text)
- hash[:profanity] = SadPanda.polarity (text)
- hash[:emotion] = SadPanda.emotion (text)
- hash[:reading_level] = Odyssey.coleman_liau (text)
- return hash
- else
- return false
- end
- end
- def self.crawl(url, user_id)
- job_id = SecureRandom.hex(8)
- qid = SpiderWorker.perform_async(url, user_id, job_id)
- if qid
- return qid
- else
- return false
- end
- end
- def self.get_page_rank(url)
- googlerank = GooglePageRank.get(url)
- if googlerank
- return googlerank
- else
- return false
- end
- end
-
- def self.extract_product(url)
- if url
- hash = {}
- product = Fletcher.fetch url
- hash[:product_name] = product.name # => "Avenir Deluxe Unicycle (20-Inch Wheel)"
- hash[:description] = product.description
- # hash[:image] = product.image.src || nil
- hash[:price] = product.price
- hash
- else
- return false
- end
- end
- def self.extract_entities(text)
- if text
- entities = @@ner.perform(text)
- if entities
- return entities
- else
- return false
- end
- end
- end
- def self.check_email(email_address)
- if email_address
- resp = EmailVerifier.check(email_address)
- if resp
- return resp
- else
- return false
- end
- end
- end
- def self.create_services
- json = '{
- "service":"imageserv",
- "parameters":{
- "mllib":{
- "gpu":true
- },
- "input":{
- "width":224,
- "height":224
- },
- "output":{
- "best":3,
- "template":"{ {{#body}}{{#predictions}} \"uri\":\"{{uri}}\",\"categories\": [ {{#classes}} { \"category\":\"{{cat}}\",\"score\":{{prob}} } {{^last}},{{/last}}{{/classes}} ] {{/predictions}}{{/body}} }",
- "network":{
- "url":"your-elasticsearch-server.com/images/img",
- "http_method":"POST"
- }
- }
- },
- "data":["http://i.ytimg.com/vi/0vxOhd4qlnA/maxresdefault.jpg"]
- }'
- result = system("curl -XPOST 'http://localhost:8080/predict' -d #{json}")
- end
- def self.validate_email(email_address, user_id)
- resp = @@validator.perform(email_address, user_id)
- if resp
- return resp
- else
- return false
- end
- end
- def self.analyze_email(email_address)
- if email_address
- hash = {}
- email_domain = email_address.to_s.split("@").last
- school = Swot::school_name email_address
- govt_domain = Gman.new email_address
- hash[:domain] = email_domain
- if school
- hash[:academia] ||= school
- end
- if govt_domain
- hash[:govt_agency] = govt_domain.agency
- # hash[:domain] ||= govt_domain.domain
- hash[:is_govt] = govt_domain.federal?
- hash[:academia] ||= false
- end
- return hash
- else
- return false
- end
- end
- def self.analyze_phone(phone_number)
- if phone_number
- hash = {}
- identifier = Phonelib.parse(phone_number)
- hash[:number] = phone_number.phony_formatted(:normalize => :US, :spaces => '-')
- if phone_number[0].to_s == "1"
- area = phone_number.to_s[1..3]
- else
- area = phone_number.to_s[0..2]
- end
- hash[:region] = Integer(area).to_region(:city => true)
- hash[:type] = identifier.human_type
- hash[:country] = identifier.country
- hash[:location] = identifier.geo_name
- return hash
- else
- return false
- end
- end
- def self.analyze_name(first_name, last_name)
- if first_name and last_name
- hash = {}
- hash[:gender] = Guess.gender(first_name.to_s.humanize)
- hash[:ethnicity] = $races[last_name.to_s.upcase]
- hash[:name] = [first_name, last_name].join(" ")
- return hash
- else
- return false
- end
- end
- def self.analyze_domain(domain_name)
- if domain_name
- url = "http://#{domain_name}"
- hash = {}
- doc = Pismo::Document.new url
- whois_data = Whois.whois(domain_name)
- googlerank = GooglePageRank.get(url)
- meta = MetaInspector.new(url)
- if doc and doc.title
- hash[:title] = doc.title
- hash[:author] = doc.author
- hash[:meta_keywords] = doc.keywords
- hash[:meta_description] = doc.description
- end
- if whois_data
- hash[:whois] = whois_data
- end
- if googlerank
- hash[:google_links] = googlerank.to_s
- end
- if meta
- hash[:meta] = meta.to_hash.to_s
- end
- return hash
- else
- return false
- end
- end
- def self.social_shares(social_media_url)
- if social_media_url
- result = SocialShares.all social_media_url
- return result
- else
- return false
- end
- end
- end