Browse Source

extensions

Peter Alcock 8 months ago
parent
commit
51a1ff4c00
43 changed files with 2060 additions and 0 deletions
  1. 35 0
      polipus-cassandra/.gitignore
  2. 1 0
      polipus-cassandra/.ruby-version
  3. 2 0
      polipus-cassandra/Gemfile
  4. 22 0
      polipus-cassandra/LICENSE.txt
  5. 29 0
      polipus-cassandra/README.md
  6. 2 0
      polipus-cassandra/Rakefile
  7. 5 0
      polipus-cassandra/lib/polipus-cassandra.rb
  8. 30 0
      polipus-cassandra/lib/polipus-cassandra/policies/default.rb
  9. 1 0
      polipus-cassandra/lib/polipus-cassandra/policies/policies.rb
  10. 307 0
      polipus-cassandra/lib/polipus-cassandra/queue_overflow/cassandra_queue.rb
  11. 244 0
      polipus-cassandra/lib/polipus-cassandra/storage/cassandra_store.rb
  12. 30 0
      polipus-cassandra/polipus-cassandra.gemspec
  13. 174 0
      polipus-cassandra/spec/polipus-cassandra/storage/cassandra_store_spec.rb
  14. 44 0
      polipus-cassandra/spec/spec_helper.rb
  15. 35 0
      polipus-elasticsearch/.gitignore
  16. 2 0
      polipus-elasticsearch/Gemfile
  17. 22 0
      polipus-elasticsearch/LICENSE.txt
  18. 29 0
      polipus-elasticsearch/README.md
  19. 2 0
      polipus-elasticsearch/Rakefile
  20. 3 0
      polipus-elasticsearch/lib/polipus-elasticsearch.rb
  21. 169 0
      polipus-elasticsearch/lib/polipus-elasticsearch/index/page.rb
  22. 104 0
      polipus-elasticsearch/lib/polipus-elasticsearch/storage/elasticsearch_store.rb
  23. 27 0
      polipus-elasticsearch/polipus-elasticsearch.gemspec
  24. 175 0
      polipus-elasticsearch/spec/polipus-elasticsearch/storage/elasticsearch_store_spec.rb
  25. 51 0
      polipus-elasticsearch/spec/spec_helper.rb
  26. 17 0
      polipus-storage-mysql/.gitignore
  27. 11 0
      polipus-storage-mysql/.rubocop.yml
  28. 4 0
      polipus-storage-mysql/Gemfile
  29. 22 0
      polipus-storage-mysql/LICENSE.txt
  30. 44 0
      polipus-storage-mysql/README.md
  31. 2 0
      polipus-storage-mysql/Rakefile
  32. 126 0
      polipus-storage-mysql/lib/polipus/storage/mysql_store.rb
  33. 26 0
      polipus-storage-mysql/polipus-storage-mysql.gemspec
  34. 129 0
      polipus-storage-mysql/spec/mysql_storage_spec.rb
  35. 18 0
      polipus-storage-mysql/spec/spec_helper.rb
  36. 17 0
      polipus-storage-s3/.gitignore
  37. 4 0
      polipus-storage-s3/Gemfile
  38. 22 0
      polipus-storage-s3/LICENSE.txt
  39. 29 0
      polipus-storage-s3/README.md
  40. 2 0
      polipus-storage-s3/Rakefile
  41. 0 0
      polipus-storage-s3/lib/polipus/storage/s3_store.rb
  42. 24 0
      polipus-storage-s3/polipus-storage-s3.gemspec
  43. 18 0
      polipus-storage-s3/spec/spec_helper.rb

+ 35 - 0
polipus-cassandra/.gitignore

@@ -0,0 +1,35 @@
+*.gem
+*.rbc
+/.config
+/coverage/
+/InstalledFiles
+/pkg/
+/spec/reports/
+/test/tmp/
+/test/version_tmp/
+/tmp/
+
+## Specific to RubyMotion:
+.dat*
+.repl_history
+build/
+
+## Documentation cache and generated files:
+/.yardoc/
+/_yardoc/
+/doc/
+/rdoc/
+
+## Environment normalisation:
+/.bundle/
+/vendor/bundle
+/lib/bundler/man/
+
+# for a library or gem, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+Gemfile.lock
+# .ruby-version
+# .ruby-gemset
+
+# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+.rvmrc

+ 1 - 0
polipus-cassandra/.ruby-version

@@ -0,0 +1 @@
+1.9.3-p551

+ 2 - 0
polipus-cassandra/Gemfile

@@ -0,0 +1,2 @@
+source 'https://rubygems.org'
+gemspec

+ 22 - 0
polipus-cassandra/LICENSE.txt

@@ -0,0 +1,22 @@
+Copyright (c) 2015 Stefano Fontanelli
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 29 - 0
polipus-cassandra/README.md

@@ -0,0 +1,29 @@
+# Polipus: addons for Cassandra
+
+TODO: Write a gem description
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'polipus-cassandra'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install polipus-cassandra
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it ( http://github.com/<my-github-username>/polipus-cassandra/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

+ 2 - 0
polipus-cassandra/Rakefile

@@ -0,0 +1,2 @@
+# coding: utf-8
+require 'bundler/gem_tasks'

+ 5 - 0
polipus-cassandra/lib/polipus-cassandra.rb

@@ -0,0 +1,5 @@
+# encoding: UTF-8
+
+require 'polipus-cassandra/policies/policies'
+require 'polipus-cassandra/queue_overflow/cassandra_queue'
+require 'polipus-cassandra/storage/cassandra_store'

+ 30 - 0
polipus-cassandra/lib/polipus-cassandra/policies/default.rb

@@ -0,0 +1,30 @@
+# encoding: utf-8
+require 'cassandra'
+
+module Corm
+  module Retry
+    module Policies
+      class Default
+        include Cassandra::Retry::Policy
+
+        def read_timeout(_statement, consistency, _required, _received, retrieved, retries)
+          return reraise if retries >= 5
+          sleep(retries.to_f + Random.rand(0.0..1.0))
+          retrieved ? reraise : try_again(consistency)
+        end
+
+        def write_timeout(_statement, consistency, _type, _required, _received, retries)
+          return reraise if retries >= 5
+          sleep(retries.to_f + Random.rand(0.0..1.0))
+          try_again(consistency)
+        end
+
+        def unavailable(_statement, consistency, _required, _alive, retries)
+          return reraise if retries >= 5
+          sleep(retries.to_f + Random.rand(0.0..1.0))
+          try_again(consistency)
+        end
+      end
+    end
+  end
+end

+ 1 - 0
polipus-cassandra/lib/polipus-cassandra/policies/policies.rb

@@ -0,0 +1 @@
+require 'polipus-cassandra/policies/default'

+ 307 - 0
polipus-cassandra/lib/polipus-cassandra/queue_overflow/cassandra_queue.rb

@@ -0,0 +1,307 @@
+# encoding: UTF-8
+require 'cassandra'
+require 'polipus'
+
+module Polipus
+  module QueueOverflow
+    class CassandraQueue
+
+      # CassandraQueue wants to persist documents (please, still ignore the
+      # jargon inherited from Mongo) like the following JSON-ish entry.
+      #
+      # There is no superclass here but I've in mind the interface implicitly
+      # defined by Polipus::QueueOverflow::DevNullQueue that, more or less has:
+      #
+      # def initialize
+      # def length
+      # def empty?
+      # def clear
+      # def push(_data)
+      # def pop(_ = false)
+      #
+      # Taking some data from our backend.production.*****.com/polipus
+      # I found:
+      #
+      # mongos> db.getCollectionNames()
+      # [
+      #         "data-com-companies",
+      #         "data_com_companies",
+      #         "googleplus",
+      #         "linkedin",
+      #         "linkedin-companies",
+      #         "linkedin_companies_parsed",
+      #         "linkedin_jobs",
+      #         "linkedin_jobs_parsed",
+      #         "linkedin_pages_errors",
+      #         "polipus_q_overflow_data-com-companies_queue_overflow",
+      #         "polipus_q_overflow_data_com_companies_queue_overflow",
+      #         "polipus_q_overflow_googleplus_queue_overflow",
+      #         "polipus_q_overflow_linkedin-companies_queue_overflow",
+      #         "polipus_q_overflow_linkedin_jobs_queue_overflow",
+      #         "polipus_q_overflow_linkedin_jobs_queue_overflow_old",
+      #         "polipus_q_overflow_linkedin_refresh_queue_overflow",
+      #         "system.indexes"
+      # ]
+      #
+      # mongos> db.getCollection("polipus_q_overflow_linkedin_jobs_queue_overflow").find().limit(1)
+      # {
+      #   "_id" : ObjectId("54506b98e3d55b20c40b32d3"),
+      #   "payload" : "{\"url\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=7&trk=jserp_pagination_next\",\"depth\":6,\"referer\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=6&trk=jserp_pagination_6\",\"fetched\":false}"
+      # }
+      #
+      # mongos> db.polipus_q_overflow_linkedin_refresh_queue_overflow.find().limit(10)
+      # {
+      #   "_id" : ObjectId("544072b6e3d55b0db7000001"),
+      #   "payload" : "{\"url\":\"http://www.linkedin.com/in/*****\",\"depth\":0,\"fetched\":false}"
+      # }
+      #
+      # We also assume this MonkeyPatch:
+      # Polipus::QueueOverflow.cassandra_queue(namespace, options = {})
+      # that returns instances of this class.
+
+      attr_accessor :cluster, :keyspace, :table
+
+      # There is a validation enforced to `:keyspace` and `:table` because
+      # Cassandra is not happy when a keyspace or a table name contains a
+      # hyphen.
+      def initialize(options = {})
+        raise ArgumentError unless options_are_valid?(options)
+        @cluster = options[:cluster]
+        @keyspace = options[:keyspace].gsub("-", "_")
+        @table = options[:table].gsub("-", "_")
+        @semaphore = Mutex.new
+        @options = options
+        @timeuuid_generator = Cassandra::Uuid::Generator.new
+        @logger = @options[:logger] ||= Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
+      end
+
+      # Length aka Size aka Count is supported in Cassandra... like your PostgreSQL
+      # you can COUNT.
+      #
+      # SELECT COUNT (*) FROM keyspace.table_name;
+      #
+      # TBH I'm not sure if being "defensive" and returning 0/nil in case
+      # the results is_empty? ... I'm leaving (now) the code simple and noisy
+      # if something went wrong in the COUNT.
+      def length
+        table_ = [keyspace, table].compact.join '.'
+        statement = "SELECT COUNT (*) FROM #{table_} ;"
+        result = session.execute(statement)
+        result.first['count']
+      end
+
+      # Return true if the table has no rows.
+      # This is achieved with a 'SELECT WITH LIMIT 1' query.
+      def empty?
+        return get.first.nil?
+      end
+
+      # Clear is a fancy name for a DROP TABLE IF EXISTS <table_>.
+      def clear
+        table_ = [keyspace, table].compact.join '.'
+        statement = "DROP TABLE IF EXISTS #{table_} ;"
+        session.execute(statement)
+      end
+
+      # push is the "write into Cassandra" method.
+      def push(data)
+        return nil if data.nil?
+        obj = MultiJson.decode(data)
+
+        table_ = [keyspace, table].compact.join('.')
+        queue_name = @keyspace
+        created_at = @timeuuid_generator.now
+
+        begin
+          @semaphore.synchronize do
+
+            if obj.has_key?('payload') && !obj['payload'].empty?
+              payload = MultiJson.encode(obj['payload'])
+            else
+              payload = nil
+            end
+
+            column_names = %w[ queue_name created_at payload ]
+            values_placeholders = column_names.map{|_| '?'}.join(',')
+            statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
+
+            session.execute(
+              session.prepare(statement),
+              arguments: [
+                queue_name,
+                created_at,
+                payload
+              ])
+          end
+        rescue Encoding::UndefinedConversionError
+          puts $!.error_char.dump
+          puts $!.error_char.encoding
+        end
+
+        @logger.debug { "Writing this entry [#{[queue_name, created_at].to_s}]" }
+        [queue_name, created_at].to_s
+      end
+
+      # Pop removes 'n' entries from the overflow table (treated as a queue)
+      # and returns a paged result.
+      # results.class #=> Cassandra::Results::Paged
+      #
+      # Polipus is expecting a String, that will be JSON-parsed with the purpose
+      # to build a Polipus::Page object.
+      def pop(n = 1)
+        # A recap: pop should remove oldest N messages and return to the caller.
+        #
+        # Let's see how this queue is implemented.
+        # In redis, messages are LPUSH-ed:
+        #
+        #  4 - 3 - 2 - 1 --> REDIS
+        #      4 - 3 - 2 --> REDIS
+        #          4 - 3 --> REDIS
+        #              4 --> REDIS
+        #
+        # Then, in the fast_dequeue, are RPOP-ped:
+        #
+        # REDIS --> 1
+        # REDIS --> 2 - 1
+        # REDIS --> 3 - 2 - 1
+        # REDIS --> 4 - 3 - 2 - 1
+        #
+        # Then, are received in this order:
+        # [1] -> TimeUUID(1) = ...
+        # [2] -> TimeUUID(2) = ...
+        # [3] -> TimeUUID(3) = ...
+        # [4] -> TimeUUID(4) = ...
+        #
+        # As you can see below, are ORDER BY (created_at ASC)... that means
+        # "olders first". When using 'LIMIT n' in a query, you get the 'n'
+        # olders entries.
+        #
+        # cqlsh> SELECT  * FROM  polipus_queue_overflow_linkedin.linkedin_overflow ;
+        #
+        #  queue_name                      | created_at                           | payload
+        # ---------------------------------+--------------------------------------+---------
+        #  polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 |     "1"
+        #  polipus_queue_overflow_linkedin | 46339f8a-1c04-11e5-844b-0b314c777502 |     "2"
+        #  polipus_queue_overflow_linkedin | 46349962-1c04-11e5-844b-0b314c777502 |     "3"
+        #  polipus_queue_overflow_linkedin | 46351860-1c04-11e5-844b-0b314c777502 |     "4"
+        #
+        # (4 rows)
+        # cqlsh> SELECT  * FROM  polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1;
+        #
+        #  queue_name                      | created_at                           | payload
+        # ---------------------------------+--------------------------------------+---------
+        #  polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 |     "1"
+        #
+        # (1 rows)
+        #
+        table_ = [keyspace, table].compact.join '.'
+        results = get(n)
+        results.each do |entry|
+          statement = "DELETE FROM #{table_} WHERE queue_name = '#{entry['queue_name']}' AND created_at = #{entry['created_at']} ;"
+          session.execute(statement)
+        end
+
+        # Let's respect the API as expected by Polipus.
+        # Otherwise the execute returns a Cassandra::Results::Paged
+        if !results.nil? && results.respond_to?(:count) && results.count == 1
+          return results.first['payload']
+        end
+        return results
+      end
+
+      alias_method :size, :length
+      alias_method :dec, :pop
+      alias_method :shift, :pop
+      alias_method :enc, :push
+      alias_method :<<, :push
+
+      def keyspace!(replication = nil, durable_writes = true)
+        replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
+        statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
+        cluster.connect.execute(statement)
+      end
+
+      def session
+        @session ||= @cluster.connect(keyspace)
+      end
+
+      # Taking a look in the Cassandra KEYSPACE you will found:
+      #
+      # cqlsh> DESCRIBE KEYSPACE polipus_queue_overflow_linkedin ;
+      #
+      # CREATE KEYSPACE polipus_queue_overflow_linkedin WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '3'}  AND durable_writes = true;
+      #
+      # CREATE TABLE polipus_queue_overflow_linkedin.linkedin_overflow (
+      #     queue_name text,
+      #     created_at timeuuid,
+      #     payload text,
+      #     PRIMARY KEY (queue_name, created_at)
+      # ) WITH CLUSTERING ORDER BY (created_at ASC)
+      #     AND bloom_filter_fp_chance = 0.01
+      #     AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
+      #     AND comment = ''
+      #     AND compaction = {'min_threshold': '4', 'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32'}
+      #     AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
+      #     AND dclocal_read_repair_chance = 0.1
+      #     AND default_time_to_live = 0
+      #     AND gc_grace_seconds = 864000
+      #     AND max_index_interval = 2048
+      #     AND memtable_flush_period_in_ms = 0
+      #     AND min_index_interval = 128
+      #     AND read_repair_chance = 0.0
+      #     AND speculative_retry = '99.0PERCENTILE';
+      #
+      # This means that:
+      # - queue_name is partition key;
+      # - created_at is clustering key;
+      #
+      # With sample data:
+      #
+      # cqlsh> SELECT  * FROM  polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1 ;
+      #
+      #  queue_name                      | created_at                           | payload
+      # ---------------------------------+--------------------------------------+---------------------------------------------------------------------------------+
+      #  polipus_queue_overflow_linkedin | de17ece6-1e5e-11e5-b997-47a87c40c422 | "{\"url\":\"http://www.linkedin.com/in/foobar\",\"depth\":0,\"fetched\":false}"
+      #
+      # (1 rows)
+      # cqlsh>
+      #
+      def table!(properties = nil)
+        table_ = [keyspace, table].compact.join '.'
+        def_ = "CREATE TABLE IF NOT EXISTS #{table_}
+          (
+            queue_name TEXT,
+            created_at TIMEUUID,
+            payload TEXT,
+            PRIMARY KEY (queue_name, created_at)
+          )"
+        props = Array(properties).join(' AND ')
+        statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
+        session.execute(statement)
+      end
+
+      private
+
+      def options_are_valid?(options)
+        options.has_key?(:cluster) && options.has_key?(:keyspace) && options.has_key?(:table)
+      end
+
+      def limit_is_valid?(limit)
+        !limit.nil? && limit.respond_to?(:to_i) && limit.to_i > 0
+      end
+
+      # results.class => Cassandra::Results::Paged
+      def get(limit = 1)
+        # coerce to int if a TrueClass/FalseClass is given.
+        limit = 1 if [true, false].include?(limit)
+
+        raise ArgumentError.new("Invalid limit value: must be an INTEGER greater than 0 (got #{limit.inspect}).") unless limit_is_valid?(limit)
+        table_ = [keyspace, table].compact.join '.'
+        statement = "SELECT queue_name, created_at, payload FROM #{table_} LIMIT #{limit.to_i} ;"
+        @semaphore.synchronize do
+          return session.execute(session.prepare(statement), arguments: [])
+        end
+      end
+    end
+  end
+end

+ 244 - 0
polipus-cassandra/lib/polipus-cassandra/storage/cassandra_store.rb

@@ -0,0 +1,244 @@
+# encoding: UTF-8
+require 'cassandra'
+require 'multi_json'
+require 'polipus'
+require 'thread'
+require 'zlib'
+
+module Polipus
+  module Storage
+    class CassandraStore < Base
+
+      # CassandraStore wants to persist documents (please ignore the jargon
+      # inherited from MongoDB) like the following JSON-ish entry:
+      #
+      # > db['linkedin-refresh'].find({})
+      #
+      #   {
+      #     "_id" : ObjectId("...."),
+      #     "url" : "https://www.awesome.org/meh",
+      #     "code" : 200,
+      #     "depth" : 0,
+      #     "referer" : "",
+      #     "redirect_to" : "",
+      #     "response_time" : 1313,
+      #     "fetched" : true,
+      #     "user_data" :
+      #       {
+      #         "imported" : false,
+      #         "is_developer" : false,
+      #         "last_modified" : null
+      #       },
+      #      "fetched_at" : 1434977757,
+      #      "error" : "",
+      #      "uuid" : "4ddce293532ea2454356a4210e61c363"
+      #  }
+
+      attr_accessor :cluster, :keyspace, :table
+
+      BINARY_FIELDS = %w(body headers user_data)
+
+      def initialize(options = {})
+        @cluster = options[:cluster]
+        @keyspace = options[:keyspace]
+        @table = options[:table]
+        @except = options[:except] || []
+        @semaphore = Mutex.new
+      end
+
+      # {
+      #   'url'           => @url.to_s,
+      #   'headers'       => Marshal.dump(@headers),
+      #   'body'          => @body,
+      #   'links'         => links.map(&:to_s),
+      #   'code'          => @code,
+      #   'depth'         => @depth,
+      #   'referer'       => @referer.to_s,
+      #   'redirect_to'   => @redirect_to.to_s,
+      #   'response_time' => @response_time,
+      #   'fetched'       => @fetched,
+      #   'user_data'     => @user_data.nil? ? {} : @user_data.marshal_dump,
+      #   'fetched_at'    => @fetched_at,
+      #   'error'         => @error.to_s
+      # }
+
+      def add(page)
+        @semaphore.synchronize do
+          table_ = [keyspace, table].compact.join '.'
+          uuid_ = uuid(page)
+          obj = page.to_hash
+          Array(@except).each { |e| obj.delete(e.to_s) }
+
+          begin
+            BINARY_FIELDS.each do |field|
+              obj[field] = obj[field].to_s.encode('UTF-8', {
+                invalid: :replace,
+                undef: :replace,
+                replace: '?' }) if can_be_converted?(obj[field])
+              # ec = Encoding::Converter.new("ASCII-8BIT", "UTF-8")
+              # obj[field] = ec.convert(obj[field]) if can_be_converted?(obj[field])
+              # obj[field] = obj[field].force_encoding('ASCII-8BIT').force_encoding('UTF-8') if can_be_converted?(obj[field])
+            end
+
+            json = MultiJson.encode(obj)
+
+            url = obj.fetch('url', nil)
+            code = obj.fetch('code', nil)
+            depth = obj.fetch('depth', nil)
+            referer = obj.fetch('referer', nil)
+            redirectto = obj.fetch('redirect_to', nil)
+            response_time = obj.fetch('response_time', nil)
+            fetched = obj.fetch('fetched', nil)
+            error = obj.fetch('error', nil)
+            page = Zlib::Deflate.deflate(json)
+
+            if obj.has_key?('user_data') && !obj['user_data'].empty?
+              user_data = MultiJson.encode(obj['user_data'])
+            else
+              user_data = nil
+            end
+
+            value = obj.fetch('fetched_at', nil)
+            fetched_at = case value
+            when Fixnum
+              Time.at(value)
+            when String
+              Time.parse(value)
+            else
+              nil
+            end
+
+            column_names = %w[ uuid url code depth referer redirect_to response_time fetched user_data fetched_at error page ]
+            values_placeholders = column_names.map{|_| '?'}.join(',')
+            statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
+
+            session.execute(
+              session.prepare(statement),
+              arguments: [
+                uuid_,
+                url,
+                code,
+                depth,
+                referer,
+                redirectto,
+                response_time,
+                fetched,
+                user_data,
+                fetched_at,
+                error,
+                page
+              ])
+
+          rescue Encoding::UndefinedConversionError
+            puts $!.error_char.dump
+            puts $!.error_char.encoding
+          end
+
+          uuid_
+        end
+      end
+
+      def clear
+        table_ = [keyspace, table].compact.join '.'
+        statement = "DROP TABLE #{table_};"
+        session.execute statement
+      end
+
+      # TBH I'm not sure if being "defensive" and returning 0/nil in case
+      # the results is_empty? ... I'm leaving (now) the code simple and noisy
+      # if something went wrong in the COUNT.
+      def count
+        table_ = [keyspace, table].compact.join '.'
+        statement = "SELECT COUNT (*) FROM #{table_} ;"
+        result = session.execute(statement)
+        result.first['count']
+      end
+
+      def each
+        table_ = [keyspace, table].compact.join '.'
+        statement = "SELECT * FROM #{table_};"
+        session.execute(statement).each do |data|
+          page = load_page(data) unless data.nil?
+          yield data['uuid'], page
+        end
+      end
+
+      def exists?(page)
+        @semaphore.synchronize do
+          table_ = [keyspace, table].compact.join '.'
+          statement = "SELECT uuid FROM #{table_} WHERE uuid = ? LIMIT 1;"
+          results = session.execute(session.prepare(statement),
+                                    arguments: [uuid(page)])
+          !results.first.nil?
+        end
+      end
+
+      def get(page)
+        @semaphore.synchronize do
+          table_ = [keyspace, table].compact.join '.'
+          statement = "SELECT * FROM #{table_} WHERE uuid = ? LIMIT 1;"
+          results = session.execute(session.prepare(statement),
+                                    arguments: [uuid(page)])
+          data = results.first
+          load_page(data) unless data.nil?
+        end
+      end
+
+      def keyspace!(replication = nil, durable_writes = true)
+        replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
+        statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
+        cluster.connect.execute statement
+      end
+
+      def remove(page)
+        @semaphore.synchronize do
+          table_ = [keyspace, table].compact.join '.'
+          statement = "DELETE FROM #{table_} WHERE uuid = ?;"
+          session.execute(session.prepare(statement),
+                          arguments: [uuid(page)])
+          true
+        end
+      end
+
+      def session
+        @session ||= @cluster.connect(keyspace)
+      end
+
+      def table!(properties = nil)
+        table_ = [keyspace, table].compact.join '.'
+        def_ = "CREATE TABLE IF NOT EXISTS #{table_}
+          (
+            uuid TEXT PRIMARY KEY,
+            url TEXT,
+            code INT,
+            depth INT,
+            referer TEXT,
+            redirect_to TEXT,
+            response_time BIGINT,
+            fetched BOOLEAN,
+            user_data TEXT,
+            fetched_at TIMESTAMP,
+            error TEXT,
+            page BLOB
+          )"
+        props = properties.to_a.join(' AND ')
+        statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
+        session.execute statement
+      end
+
+      def load_page(data)
+        json = Zlib::Inflate.inflate(data['page'])
+        hash = MultiJson.decode(json)
+        page = Page.from_hash(hash)
+        page.fetched_at = 0 if page.fetched_at.nil?
+        page
+      end
+
+      private
+
+      def can_be_converted?(field)
+        !field.nil? && field.is_a?(String) && !field.empty?
+      end
+    end
+  end
+end

+ 30 - 0
polipus-cassandra/polipus-cassandra.gemspec

@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+Gem::Specification.new do |spec|
+  spec.name          = 'polipus-cassandra'
+  spec.version       = '0.1.4'
+  spec.authors       = ['Stefano Fontanelli', 'Edoardo Rossi']
+  spec.email         = ['s.fontanelli@gmail.com', 'edoardo@gild.com']
+  spec.summary       = 'Add support for Cassandra in Polipus crawler'
+  spec.description   = 'Add support for Cassandra in Polipus crawler'
+  spec.homepage      = 'https://github.com/stefanofontanelli/polipus-cassandra'
+  spec.license       = 'MIT'
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(/^bin\//) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(/^(test|spec|features)\//)
+  spec.require_paths = ['lib']
+
+  spec.add_runtime_dependency 'cassandra-driver', '~> 2.0.1', '>= 2.0.1'
+  spec.add_runtime_dependency 'multi_json', '~> 1.10.0', '>= 1.10.0'
+  spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
+
+  spec.add_development_dependency 'rake', '~> 10.3'
+  spec.add_development_dependency 'rspec', '~> 3.1.0'
+  spec.add_development_dependency 'flexmock', '~> 1.3'
+  spec.add_development_dependency 'vcr', '~> 2.9.0'
+  spec.add_development_dependency 'webmock', '~> 1.20.0'
+  spec.add_development_dependency 'coveralls'
+end

+ 174 - 0
polipus-cassandra/spec/polipus-cassandra/storage/cassandra_store_spec.rb

@@ -0,0 +1,174 @@
+# encoding: UTF-8
+require 'cassandra'
+require 'logger'
+require 'polipus-cassandra'
+require 'spec_helper'
+
+describe Polipus::Storage::CassandraStore do
+  before(:all)do
+    @logger = Logger.new(STDOUT).tap { |logger| logger.level = Logger::WARN }
+    @cluster = Cassandra.cluster hosts: ['127.0.0.1'], logger: @logger
+    @keyspace = 'polipus_cassandra_test'
+    @table = 'cassandra_store_test'
+    @storage = Polipus::Storage::CassandraStore.new(
+      cluster: @cluster,
+      keyspace: @keyspace,
+      table: @table,
+    )
+
+    @storage.keyspace!
+    @storage.table!
+
+    @storage_without_code_and_body = Polipus::Storage::CassandraStore.new(
+      cluster: @cluster,
+      keyspace: @keyspace,
+      table: @table,
+      except: ['code', 'body']
+    )
+  end
+
+  after(:all) do
+    @storage.clear
+  end
+
+  it 'should store a page' do
+    p = page_factory 'http://www.google.com'
+    uuid = @storage.add p
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    p = @storage.get p
+    expect(p.url.to_s).to eq('http://www.google.com')
+    expect(p.body).to eq('<html></html>')
+  end
+
+  it 'should store all the relevant data from the page' do
+    url = "http://www.duckduckgo.com"
+    referer = "http://www.actually.nowhere.com"
+    redirectto = "#{url}/your_super_awesome_results?page=42"
+    now = Time.now.to_i
+    p = page_factory(
+      url,
+      {
+        referer: referer,
+        redirect_to: redirectto,
+        fetched_at: now
+      })
+    uuid = @storage.add p
+    expect(uuid).to eq('3cd657f53c74f22c1a21b420ce3863fd')
+    p = @storage.get p
+
+    expect(p.url.to_s).to eq(url)
+    expect(p.referer.to_s).to eq(referer)
+    expect(p.redirect_to.to_s).to eq(redirectto)
+    expect(p.fetched_at).to eq(now)
+    expect(p.body).to eq('<html></html>')
+
+    # for the sake of the other tests...
+    expect(@storage.remove(p)).to be_truthy
+  end
+
+  it 'should update a page' do
+    p = page_factory 'http://www.google.com', code: 301
+    @storage.add p
+    p = @storage.get p
+    expect(p.code).to eq(301)
+  end
+
+  it 'should iterate over stored pages' do
+    @storage.each do |k, page|
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
+    end
+  end
+
+  it 'should delete a page' do
+    p = page_factory 'http://www.google.com', code: 301
+    @storage.remove p
+    expect(@storage.get(p)).to be_nil
+  end
+
+  it 'should store a page removing a query string from the uuid generation' do
+    p = page_factory 'http://www.asd.com/?asd=lol'
+    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1'
+    @storage.include_query_string_in_uuid = false
+    @storage.add p
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove p
+  end
+
+  it 'should store a page removing a query string from the uuid generation no ending slash' do
+    p = page_factory 'http://www.asd.com?asd=lol'
+    p_no_query = page_factory 'http://www.asd.com'
+    @storage.include_query_string_in_uuid = false
+    @storage.add p
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove p
+  end
+
+  it 'should store a page with user data associated' do
+    p = page_factory 'http://www.user.com'
+    p.user_data.name = 'Test User Data'
+    @storage.add p
+    expect(@storage.exists?(p)).to be_truthy
+    p = @storage.get(p)
+    expect(p.user_data.name).to eq('Test User Data')
+    @storage.remove p
+  end
+
+  it 'should honor the except parameters' do
+    pag = page_factory 'http://www.user-doo.com'
+    expect(pag.code).to eq(200)
+    expect(pag.body).to eq('<html></html>')
+
+    @storage_without_code_and_body.add(pag)
+    pag = @storage_without_code_and_body.get(pag)
+
+    expect(pag.body).to be_nil
+    expect(pag.code).to eq(0)
+    @storage_without_code_and_body.remove(pag)
+  end
+
+  it 'should return false if a doc not exists' do
+    @storage.include_query_string_in_uuid = false
+    p_other  = page_factory 'http://www.asdrrrr.com'
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.add p_other
+    expect(@storage.exists?(p_other)).to be_truthy
+    p_other  = page_factory 'http://www.asdrrrr.com?trk=asd-lol'
+    expect(@storage.exists?(p_other)).to be_truthy
+    @storage.include_query_string_in_uuid = true
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.include_query_string_in_uuid = false
+    @storage.remove p_other
+  end
+
+  it 'should set page.fetched_at based on the id creation' do
+    p = page_factory 'http://www.user-doojo.com'
+    @storage.add p
+    expect(p.fetched_at).to be_nil
+    p = @storage.get p
+    expect(p.fetched_at).not_to be_nil
+    @storage.remove p
+  end
+
+  it 'should NOT set page.fetched_at if already present' do
+    p = page_factory 'http://www.user-doojooo.com'
+    p.fetched_at = 10
+    @storage.add p
+    p = @storage.get p
+    expect(p.fetched_at).to be 10
+    @storage.remove p
+  end
+
+  it 'should store two pages and the count will be two' do
+    pages = ['http://www.google.com', 'http://www.duckduckgo.com'].map do |url|
+      page_factory(url).tap do |page|
+        @storage.add(page)
+      end
+    end
+    expect(@storage.count).to be 2
+    pages.each do |page|
+      @storage.remove(page)
+    end
+    expect(@storage.count).to be 0
+  end
+end

+ 44 - 0
polipus-cassandra/spec/spec_helper.rb

@@ -0,0 +1,44 @@
+# Require this file using `require "spec_helper"`
+# to ensure that it is only loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require 'digest/md5'
+require 'coveralls'
+require 'vcr'
+require 'webmock/rspec'
+
+Coveralls.wear!
+
+# Record/replay HTTP traffic through WebMock; cassettes are stored next to
+# the specs and named per-example (see the `around` hook below).
+VCR.configure do |c|
+  c.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
+  c.hook_into :webmock
+end
+
+require 'polipus'
+
+RSpec.configure do |config|
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+  config.mock_with :flexmock
+  # Wrap each example in a VCR cassette keyed by the MD5 of its full
+  # description, and print the example name with its wall-clock duration.
+  config.around(:each) do |example|
+    t = Time.now
+    print example.metadata[:full_description]
+    VCR.use_cassette(Digest::MD5.hexdigest(example.metadata[:full_description])) do
+      example.run
+      puts " [#{Time.now - t}s]"
+    end
+  end
+  # NOTE(review): presumably keeps Polipus from installing its own signal
+  # traps while specs run — confirm against Polipus::SignalHandler.
+  config.before(:each) { Polipus::SignalHandler.disable }
+end
+
+# Build a Polipus::Page with spec-friendly defaults so examples only have
+# to override the fields they care about.
+def page_factory(url, params = {})
+  params[:code] = 200 unless params.has_key?(:code)
+  params[:body] = '<html></html>' unless params.has_key?(:body)
+  Polipus::Page.new url, params
+end

+ 35 - 0
polipus-elasticsearch/.gitignore

@@ -0,0 +1,35 @@
+*.gem
+*.rbc
+/.config
+/coverage/
+/InstalledFiles
+/pkg/
+/spec/reports/
+/test/tmp/
+/test/version_tmp/
+/tmp/
+
+## Specific to RubyMotion:
+.dat*
+.repl_history
+build/
+
+## Documentation cache and generated files:
+/.yardoc/
+/_yardoc/
+/doc/
+/rdoc/
+
+## Environment normalisation:
+/.bundle/
+/vendor/bundle
+/lib/bundler/man/
+
+# for a library or gem, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+Gemfile.lock
+# .ruby-version
+# .ruby-gemset
+
+# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+.rvmrc

+ 2 - 0
polipus-elasticsearch/Gemfile

@@ -0,0 +1,2 @@
+source 'https://rubygems.org'
+gemspec

+ 22 - 0
polipus-elasticsearch/LICENSE.txt

@@ -0,0 +1,22 @@
+Copyright (c) 2015 Stefano Fontanelli
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 29 - 0
polipus-elasticsearch/README.md

@@ -0,0 +1,29 @@
+# Polipus: addons for ElasticSearch
+
+Storage backend and index support for persisting pages crawled by [Polipus](https://github.com/taganaka/polipus) into ElasticSearch.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'polipus-elasticsearch'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install polipus-elasticsearch
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it ( http://github.com/stefanofontanelli/polipus-elasticsearch/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

+ 2 - 0
polipus-elasticsearch/Rakefile

@@ -0,0 +1,2 @@
+# coding: utf-8
+require 'bundler/gem_tasks'

+ 3 - 0
polipus-elasticsearch/lib/polipus-elasticsearch.rb

@@ -0,0 +1,3 @@
+# encoding: UTF-8
+require 'polipus-elasticsearch/index/page'
+require 'polipus-elasticsearch/storage/elasticsearch_store'

+ 169 - 0
polipus-elasticsearch/lib/polipus-elasticsearch/index/page.rb

@@ -0,0 +1,169 @@
+require 'elasticsearch/model'
+
+# Index sizing is tunable via the environment; default to a single-shard,
+# zero-replica index suitable for development and testing.
+ENV['POLIPUS_ELASTICSEARCH_INDEX_SHARDS']    ||= '1'
+ENV['POLIPUS_ELASTICSEARCH_INDEX_REPLICAS']  ||= '0'
+
+module Polipus
+  module ElasticSearch
+    # ElasticSearch document model for a crawled Polipus page.
+    #
+    # All operations are class-level: the class wraps a single index
+    # (configured via .setup) rather than instantiating per-document
+    # objects. Documents are versioned externally via `fetched_at`, so an
+    # older fetch can never overwrite a newer one (see .store / .remove).
+    class Page
+      include Elasticsearch::Model
+
+      DEFAULT_INDEX_NAME = 'polipus-pages'
+      document_type 'polipus_page'
+      index_name DEFAULT_INDEX_NAME
+
+      settings(
+        index: {
+          number_of_shards: ENV['POLIPUS_ELASTICSEARCH_INDEX_SHARDS'].to_i,
+          number_of_replicas: ENV['POLIPUS_ELASTICSEARCH_INDEX_REPLICAS'].to_i
+        }
+      )
+      # Field mapping mirrors Polipus::Page#to_hash; _all is disabled since
+      # lookups go through the document id.
+      mapping(_all: { enabled: false }) do
+        indexes(
+          :id,
+          index: :not_analyzed
+        )
+        indexes(
+          :body,
+          type: :string
+        )
+        indexes(
+          :code,
+          type: :integer
+        )
+        indexes(
+          :depth,
+          type: :integer
+        )
+        indexes(
+          :error,
+          type: :string
+        )
+        indexes(
+          :fetched,
+          type: :boolean
+        )
+        indexes(
+          :fetched_at,
+          type: :integer
+        )
+        indexes(
+          :headers,
+          type: :string
+        )
+        indexes(
+          :links,
+          type: :string
+        )
+        indexes(
+          :redirect_to,
+          type: :string
+        )
+        indexes(
+          :referer,
+          type: :string
+        )
+        indexes(
+          :response_time,
+          type: :integer
+        )
+        indexes(
+          :url,
+          type: :string
+        )
+        indexes(
+          :user_data,
+          type: :string
+        )
+      end
+
+      # The raw Elasticsearch::Client configured via .setup.
+      def self.client
+        __elasticsearch__.client
+      end
+
+      # Number of documents currently in the index.
+      def self.count
+        client.count(index: index_name, type: document_type)['count'].to_i
+      end
+
+      # Create the index, first switching the model to `name` when given.
+      def self.create_index!(name)
+        index_name(name) unless name.nil?
+        __elasticsearch__.create_index!(index: index_name)
+      end
+
+      # Delete every document while keeping the index and mapping in place.
+      def self.clear_index!
+        client.delete_by_query(
+          index: index_name,
+          body: { query: { match_all: {} } }
+        )
+      end
+
+      def self.delete_index!
+        client.indices.delete(index: index_name)
+      end
+
+      def self.exists?(id)
+        client.exists?(
+          index: index_name,
+          type: document_type,
+          id: id
+        )
+      end
+
+      # Return the stored document source for `id`, or nil when absent.
+      def self.get(id)
+        return unless exists?(id)
+        client.get_source(
+          index: index_name,
+          type: document_type,
+          id: id
+        )
+      end
+
+      def self.index_exists?
+        client.indices.exists?(index: index_name)
+      end
+
+      # Project `obj` (a hash or an object responding to the mapped field
+      # names) onto the mapping's properties, dropping nil values.
+      def self.process_document(obj)
+        doc = { '_type' => document_type }
+        properties.each do |p|
+          doc[p.to_s] = obj.respond_to?(p.to_s) ? obj.send(p.to_s) : obj[p.to_s]
+        end
+        doc.reject { |_, value| value.nil? }
+      end
+
+      # Names of all mapped fields, as strings.
+      def self.properties
+        mapping.to_hash[document_type.to_sym][:properties].keys.map { |k| k.to_s }
+      end
+
+      # Delete the document, using "now" as an external version so a
+      # concurrent write carrying an older fetched_at cannot resurrect it.
+      def self.remove(id, refresh = false)
+        return unless exists?(id)
+        client.delete(
+          index: index_name,
+          type: document_type,
+          id: id,
+          refresh: refresh,
+          version: Time.now.to_i,
+          version_type: :external
+        )
+      end
+
+      # Bind the model to a client and (optionally) a non-default index.
+      def self.setup(client_, index_name = DEFAULT_INDEX_NAME)
+        __elasticsearch__.client = client_
+        self.index_name(index_name)
+      end
+
+      # Index the document keyed by its 'id' field, with fetched_at as an
+      # external version: re-storing an older fetch raises a Conflict,
+      # which callers are expected to rescue. Returns the document id.
+      def self.store(document, refresh = false)
+        document = process_document(document)
+        client.index(
+          index: index_name,
+          type: document_type,
+          id: document['id'],
+          body: document,
+          refresh: refresh,
+          version: document['fetched_at'].to_i,
+          version_type: :external
+        )
+        document['id']
+      end
+    end
+  end
+end

+ 104 - 0
polipus-elasticsearch/lib/polipus-elasticsearch/storage/elasticsearch_store.rb

@@ -0,0 +1,104 @@
+# encoding: UTF-8
+require 'base64'
+require 'multi_json'
+require 'polipus'
+require 'polipus-elasticsearch'
+
+module Polipus
+  module Storage
+    class ElasticSearchStore < Base
+      BINARY_FIELDS = %w(body headers user_data)
+      DEFAULT_INDEX = Polipus::ElasticSearch::Page
+
+      attr_accessor :index, :index_name, :except, :compress, :semaphore, :refresh
+
+      def initialize(client, options = {})
+        @index = options[:index] || options['index'] || DEFAULT_INDEX
+        @index_name = options[:index_name] || options['index_name']
+        @except = options[:except] || options['except'] || []
+        @compress = options[:compress] || options['compress']
+        @semaphore = Mutex.new
+        @refresh = options[:refresh] || options['refresh'] || true
+        index.setup(client, index_name)
+        index.create_index!(index_name) unless index.index_exists?
+      end
+
+      def add(page)
+        semaphore.synchronize do
+          obj = page.to_hash
+          Array(except).each { |field| obj.delete(field.to_s) }
+          BINARY_FIELDS.each do |field|
+            next if obj[field.to_s].nil? || obj[field.to_s].empty?
+            obj[field.to_s] = MultiJson.encode(obj[field.to_s]) if field.to_s == 'user_data'
+            obj[field.to_s] = Base64.encode64(obj[field.to_s])
+          end
+          obj['id'] = uuid(page)
+          obj['fetched_at'] = obj['fetched_at'].to_i
+          begin
+            index.store(obj, refresh)
+          rescue Elasticsearch::Transport::Transport::Errors::Conflict => ex
+            # you're trying to store an old version.
+          end
+        end
+      end
+
+      def clear
+        index.clear_index! if index.index_exists?
+      end
+
+      def count
+        index.count
+      end
+
+      def drop
+        index.delete_index! if index.index_exists?
+      end
+
+      def each
+        # This method is implemented only for testing purposes
+        response = index.client.search(
+          index: index_name,
+          body: {
+            query: { match_all: {} },
+            from: 0,
+            size: 25
+          }
+        )
+        response['hits']['hits'].each do |data|
+          page = load_page(data['_source'])
+          yield uuid(page), page
+        end
+      end
+
+      def exists?(page)
+        @semaphore.synchronize do
+          index.exists?(uuid(page))
+        end
+      end
+
+      def get(page)
+        @semaphore.synchronize do
+          load_page(index.get(uuid(page)))
+        end
+      end
+
+      def remove(page)
+        @semaphore.synchronize do
+          index.remove(uuid(page), refresh)
+        end
+      end
+
+      def load_page(data)
+        return nil if data.nil?
+        BINARY_FIELDS.each do |field|
+          next if data[field.to_s].nil? || data[field.to_s].empty?
+          data[field.to_s] = Base64.decode64(data[field.to_s])
+          data[field.to_s] = MultiJson.decode(data[field.to_s]) if field.to_s == 'user_data'
+        end
+        page = Page.from_hash(data)
+        page.fetched_at ||= 0
+        page
+      end
+    end
+  end
+end

+ 27 - 0
polipus-elasticsearch/polipus-elasticsearch.gemspec

@@ -0,0 +1,27 @@
+# coding: utf-8
+# Gem packaging metadata for the ElasticSearch storage addon.
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+Gem::Specification.new do |spec|
+  spec.name          = 'polipus-elasticsearch'
+  spec.version       = '0.0.4'
+  spec.authors       = ['Stefano Fontanelli']
+  spec.email         = ['s.fontanelli@gmail.com']
+  spec.summary       = 'Add support for ElasticSearch in Polipus crawler'
+  spec.description   = 'Add support for ElasticSearch in Polipus crawler'
+  spec.homepage      = 'https://github.com/stefanofontanelli/polipus-elasticsearch'
+  spec.license       = 'MIT'
+  # Package every tracked file; requires a git checkout at build time.
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(/^bin\//) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(/^(test|spec|features)\//)
+  spec.require_paths = ['lib']
+  spec.add_runtime_dependency 'elasticsearch', '~> 1.0.4'
+  spec.add_runtime_dependency 'elasticsearch-model', '~> 0.1.4'
+  spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
+  spec.add_development_dependency 'rake', '~> 10.3'
+  spec.add_development_dependency 'rspec', '~> 3.1.0'
+  spec.add_development_dependency 'flexmock', '~> 1.3'
+  spec.add_development_dependency 'vcr', '~> 2.9.0'
+  spec.add_development_dependency 'webmock', '~> 1.20.0'
+  spec.add_development_dependency 'coveralls'
+end

+ 175 - 0
polipus-elasticsearch/spec/polipus-elasticsearch/storage/elasticsearch_store_spec.rb

@@ -0,0 +1,175 @@
+# encoding: UTF-8
+require 'logger'
+require 'polipus-elasticsearch'
+require 'spec_helper'
+
+# Integration specs for ElasticSearchStore. They expect an ElasticSearch
+# node reachable at 127.0.0.1 (spec_helper's VCR config ignores localhost).
+describe Polipus::Storage::ElasticSearchStore do
+  before(:each)do
+    @logger = Logger.new(STDOUT)
+    @client = Elasticsearch::Client.new(host: '127.0.0.1', logger: @logger)
+    @client.transport.logger.level = Logger::INFO
+    @index_name = 'polipus_elasticsearch_test'
+    @storage = Polipus::Storage::ElasticSearchStore.new(
+      @client,
+      index_name: @index_name,
+      refresh: true
+    )
+    # Same index, but drops `code` and `body` before storing — exercises
+    # the :except option.
+    @storage_without_code_and_body = Polipus::Storage::ElasticSearchStore.new(
+      @client,
+      index_name: @index_name,
+      except: ['code', 'body'],
+      refresh: true
+    )
+  end
+
+  after(:each) do
+    @storage.drop
+  end
+
+  it 'should store a page' do
+    p = page_factory 'http://www.google.com'
+    uuid = @storage.add(p)
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    p = @storage.get(p)
+    expect(p).not_to be_nil
+    expect(p.url.to_s).to eq('http://www.google.com')
+    expect(p.body).to eq('<html></html>')
+    @storage.remove(p)
+    p = @storage.get(p)
+    expect(p).to be_nil
+  end
+
+  it 'should store all the relevant data from the page' do
+    url = "http://www.duckduckgo.com"
+    referer = "http://www.actually.nowhere.com"
+    redirectto = "#{url}/your_super_awesome_results?page=42"
+    now = Time.now.to_i
+    p = page_factory(
+      url,
+      {
+        referer: referer,
+        redirect_to: redirectto,
+        fetched_at: now
+      })
+    uuid = @storage.add p
+    expect(uuid).to eq('3cd657f53c74f22c1a21b420ce3863fd')
+    p = @storage.get p
+
+    expect(p.url.to_s).to eq(url)
+    expect(p.referer.to_s).to eq(referer)
+    expect(p.redirect_to.to_s).to eq(redirectto)
+    expect(p.fetched_at).to eq(now)
+    expect(p.body).to eq('<html></html>')
+
+    # for the sake of the other tests...
+    expect(@storage.remove(p)).to be_truthy
+  end
+
+  it 'should update a page' do
+    p = page_factory 'http://www.google.com', code: 301
+    @storage.add p
+    p = @storage.get p
+    expect(p.code).to eq(301)
+  end
+
+  it 'should iterate over stored pages' do
+    p = page_factory('http://www.google.com')
+    @storage.add(p)
+    @storage.each do |k, page|
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
+    end
+  end
+
+  it 'should delete a page' do
+    p = page_factory 'http://www.google.com', code: 301
+    @storage.remove p
+    expect(@storage.get(p)).to be_nil
+  end
+
+  it 'should store a page removing a query string from the uuid generation' do
+    p = page_factory 'http://www.asd.com/?asd=lol'
+    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1'
+    @storage.include_query_string_in_uuid = false
+    @storage.add p
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove p
+  end
+
+  it 'should store a page removing a query string from the uuid generation no ending slash' do
+    p = page_factory 'http://www.asd.com?asd=lol'
+    p_no_query = page_factory 'http://www.asd.com'
+    @storage.include_query_string_in_uuid = false
+    @storage.add p
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove p
+  end
+
+  it 'should store a page with user data associated' do
+    p = page_factory 'http://www.user.com'
+    p.user_data.name = 'Test User Data'
+    @storage.add p
+    expect(@storage.exists?(p)).to be_truthy
+    p = @storage.get(p)
+    expect(p.user_data.name).to eq('Test User Data')
+    @storage.remove p
+  end
+
+  it 'should honor the except parameters' do
+    pag = page_factory 'http://www.user-doo.com'
+    expect(pag.code).to eq(200)
+    expect(pag.body).to eq('<html></html>')
+
+    @storage_without_code_and_body.add(pag)
+    pag = @storage_without_code_and_body.get(pag)
+
+    expect(pag.body).to be_nil
+    expect(pag.code).to eq(0)
+    @storage_without_code_and_body.remove(pag)
+  end
+
+  it 'should return false if a doc not exists' do
+    @storage.include_query_string_in_uuid = false
+    p_other  = page_factory 'http://www.asdrrrr.com'
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.add p_other
+    expect(@storage.exists?(p_other)).to be_truthy
+    p_other  = page_factory 'http://www.asdrrrr.com?trk=asd-lol'
+    expect(@storage.exists?(p_other)).to be_truthy
+    @storage.include_query_string_in_uuid = true
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.include_query_string_in_uuid = false
+    @storage.remove p_other
+  end
+
+  it 'should set page.fetched_at based on the id creation' do
+    p = page_factory 'http://www.user-doojo.com'
+    @storage.add p
+    # page_factory always assigns fetched_at (see spec_helper), so it is
+    # present both before and after the round trip.
+    expect(p.fetched_at).not_to be_nil
+    p = @storage.get p
+    expect(p.fetched_at).not_to be_nil
+    @storage.remove p
+  end
+
+  it 'should NOT set page.fetched_at if already present' do
+    p = page_factory 'http://www.user-doojooo.com'
+    p.fetched_at = 10
+    @storage.add p
+    p = @storage.get p
+    expect(p.fetched_at).to be 10
+    @storage.remove p
+  end
+
+  it 'should store two pages and the count will be two' do
+    pages = ['http://www.google.com', 'http://www.duckduckgo.com'].map do |url|
+      page_factory(url).tap do |page|
+        @storage.add(page)
+      end
+    end
+    expect(@storage.count).to be 2
+    pages.each do |page|
+      @storage.remove(page)
+    end
+    expect(@storage.count).to be 0
+  end
+end

+ 51 - 0
polipus-elasticsearch/spec/spec_helper.rb

@@ -0,0 +1,51 @@
+# Require this file using `require "spec_helper"`
+# to ensure that it is only loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require 'digest/md5'
+require 'coveralls'
+require 'vcr'
+require 'webmock/rspec'
+
+Coveralls.wear!
+
+require 'polipus'
+
+VCR.configure do |c|
+  c.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
+  c.hook_into :webmock
+  c.allow_http_connections_when_no_cassette = true
+  c.ignore_localhost = true
+end
+
+RSpec.configure do |config|
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+  config.mock_with :flexmock
+  config.around(:each) do |example|
+    t = Time.now
+    print example.metadata[:full_description]
+    VCR.use_cassette(
+      Digest::MD5.hexdigest(example.metadata[:full_description]),
+      record: :all
+    ) do
+      example.run
+    end
+    puts " [#{Time.now - t}s]"
+  end
+  config.before(:each) { Polipus::SignalHandler.disable }
+end
+
+def page_factory(url, params = {})
+  params[:code] ||= 200 unless params.has_key?(:code)
+  params[:body] = '<html></html>' unless params.has_key?(:body)
+  params[:fetched_at] = Time.now.to_i
+  sleep(1)
+  Polipus::Page.new(url, params)
+end

+ 17 - 0
polipus-storage-mysql/.gitignore

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

+ 11 - 0
polipus-storage-mysql/.rubocop.yml

@@ -0,0 +1,11 @@
+Style/Documentation:
+  Enabled: false
+
+Style/RegexpLiteral:
+  Enabled: false
+
+Style/MethodLength:
+  Enabled: false
+
+Style/LineLength:
+  Enabled: false

+ 4 - 0
polipus-storage-mysql/Gemfile

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in polipus-storage-mysql.gemspec
+gemspec

+ 22 - 0
polipus-storage-mysql/LICENSE.txt

@@ -0,0 +1,22 @@
+Copyright (c) 2014 Francesco Laurita
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 44 - 0
polipus-storage-mysql/README.md

@@ -0,0 +1,44 @@
+# Polipus::Storage::Mysql
+
+MySQL Storage driver for [Polipus::Crawler](https://github.com/taganaka/polipus)
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'polipus-storage-mysql'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install polipus-storage-mysql
+
+## Usage
+
+```ruby
+require 'polipus'
+require 'polipus/storage/mysql_store'
+mysql_storage = Polipus::Storage.mysql_store(mysql_options, table_name)
+Polipus.crawler('rubygems','http://rubygems.org/', storage: mysql_storage) do |crawler|
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # A nokogiri object
+    puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+  end
+end
+```
+
+## MySQL options
+
+MySQL options are passed directly to the mysql2 driver: (https://github.com/brianmario/mysql2)
+
+## Contributing
+
+1. Fork it ( http://github.com/taganaka/polipus-storage-mysql/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

+ 2 - 0
polipus-storage-mysql/Rakefile

@@ -0,0 +1,2 @@
+# coding: utf-8
+require 'bundler/gem_tasks'

+ 126 - 0
polipus-storage-mysql/lib/polipus/storage/mysql_store.rb

@@ -0,0 +1,126 @@
+# coding: utf-8
+require 'polipus/storage'
+require 'polipus/page'
+require 'mysql2'
+require 'thread'
+
+module Polipus
+  module Storage
+    def self.mysql_store(mysql_options = {}, table_name = 'pages')
+      self::MysqlStore.new(mysql_options.merge(table_name: table_name))
+    end
+
+    class MysqlStore < Base
+      def initialize(options = {})
+        @tbl = options.delete :table_name
+        @my  = Mysql2::Client.new(options)
+        @mutex = Mutex.new
+        setup
+      end
+
+      def add(page)
+        @mutex.synchronize do
+          @my.query(page_to_sql(page))
+          uuid(page)
+        end
+      end
+
+      def exists?(page)
+        @mutex.synchronize do
+          @my.query("SELECT
+            EXISTS (SELECT 1 FROM #{@tbl}
+              WHERE uuid = '#{@my.escape(uuid(page))}') AS CNT")
+          .first['CNT'] == 1
+        end
+      end
+
+      def get(page)
+        @mutex.synchronize do
+          load_page(
+           @my.query("SELECT * FROM #{@tbl} WHERE uuid = '#{@my.escape(uuid(page))}' LIMIT 1", cast_booleans: true)
+          .first
+          )
+        end
+      end
+
+      def remove(page)
+        @mutex.synchronize do
+          @my.query("DELETE FROM #{@tbl} WHERE uuid = '#{@my.escape(uuid(page))}'")
+        end
+      end
+
+      def count
+        @mutex.synchronize do
+          @my.query("SELECT COUNT(*) AS CNT FROM #{@tbl}").first['CNT'].to_i
+        end
+      end
+
+      def each
+        @my.query("SELECT * FROM #{@tbl}").each do |row|
+          yield row['uuid'], load_page(row)
+        end
+      end
+
+      def clear
+        @mutex.synchronize do
+          @my.query("DELETE FROM #{@tbl}")
+        end
+      end
+
+      private
+
+      def setup
+        create_table = %Q(
+          CREATE TABLE IF NOT EXISTS #{@tbl} (
+            uuid          varchar(32) PRIMARY KEY,
+            url           varchar(255),
+            headers       blob,
+            body          blob,
+            links         blob,
+            code          int,
+            depth         int,
+            referer       varchar(255),
+            redirect_to   varchar(255),
+            response_time int,
+            fetched       boolean,
+            user_data     blob,
+            fetched_at    int,
+            error         varchar(255)
+          )
+        )
+        @my.query(create_table)
+      end
+
+      def page_to_sql(page)
+        %Q(
+          INSERT INTO #{@tbl}
+            VALUES (
+              '#{uuid(page)}',
+              '#{@my.escape(page.url.to_s)}',
+              '#{@my.escape(Marshal.dump(page.headers))}',
+              '#{@my.escape(page.body)}',
+              '#{@my.escape(Marshal.dump(page.links))}',
+              #{page.code.to_i},
+              #{page.depth.to_i},
+              '#{@my.escape(page.referer.to_s)}',
+              '#{@my.escape(page.redirect_to.to_s)}',
+              #{page.response_time.to_i},
+              #{page.fetched?},
+              '#{@my.escape(Marshal.dump(page.user_data))}',
+              #{page.fetched_at.to_i},
+              '#{@my.escape(page.error.to_s)}'
+            )
+            ON DUPLICATE KEY UPDATE
+              fetched_at = UNIX_TIMESTAMP()
+        )
+      end
+
+      def load_page(hash)
+        %w(links user_data).each do |f|
+          hash[f] = Marshal.load(hash[f]) unless hash[f].nil?
+        end
+        Page.from_hash(hash)
+      end
+    end
+  end
+end

+ 26 - 0
polipus-storage-mysql/polipus-storage-mysql.gemspec

@@ -0,0 +1,26 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+Gem::Specification.new do |spec|
+  spec.name          = 'polipus-storage-mysql'
+  spec.version       = '0.0.1'
+  spec.authors       = ['Francesco Laurita']
+  spec.email         = ['francesco.laurita@gmail.com']
+  spec.summary       = %q(TODO: Write a short summary. Required.)
+  spec.description   = %q(TODO: Write a longer description. Optional.)
+  spec.homepage      = ''
+  spec.license       = 'MIT'
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ['lib']
+
+  spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
+  spec.add_runtime_dependency 'mysql2', '~> 0.3', '>= 0.3.16'
+
+  spec.add_development_dependency 'rspec', '~> 2.99', '>= 2.99.0'
+  spec.add_development_dependency 'bundler', '~> 1.5'
+  spec.add_development_dependency 'rake'
+end

+ 129 - 0
polipus-storage-mysql/spec/mysql_storage_spec.rb

@@ -0,0 +1,129 @@
+# coding: utf-8
+require 'spec_helper'
+require 'polipus/storage/mysql_store'
+
+# Integration specs for MysqlStore. They expect a local MySQL server
+# reachable as root with an empty password; the test database is created
+# before and dropped after every example.
+describe Polipus::Storage::MysqlStore do
+  let(:test_db_name) { 'polipus_mysql_store_spec' }
+  let(:options) do
+    {
+      host: 'localhost',
+      username: 'root',
+      password: '',
+      database: test_db_name,
+      table_name: 'rspec_pages'
+    }
+  end
+
+  # Admin connection without a default database, used to create/drop the
+  # test database itself.
+  let(:my)do
+    o = options.dup
+    o.delete :database
+    Mysql2::Client.new o
+  end
+  let(:db) { Mysql2::Client.new options }
+
+  let(:page)do
+    Polipus::Page.new 'http://www.google.com/',
+                      body: '<html>
+                        <body>
+                          <a href="/a/1">1</a>
+                          <a href="/a/2">2</a>
+                        </body>
+                      </html>',
+                      code: 201,
+                      depth: 1,
+                      referer: 'http://www.google.com/1',
+                      response_time: 1,
+                      fetched: true,
+                      fetched_at: Time.now,
+                      error: 'an error',
+                      headers: { 'content-type' => ['text/html'] }
+  end
+
+  let(:storage) { Polipus::Storage.mysql_store(options, options[:table_name]) }
+
+  before(:each) do
+    my.query("CREATE DATABASE IF NOT EXISTS #{test_db_name}")
+  end
+
+  after(:each) do
+    my.query("DROP DATABASE #{test_db_name}")
+  end
+
+  context 'CREATE' do
+    it 'should store a page' do
+      page.user_data.a = 1
+      # add returns the page uuid, which is the MD5 of the URL.
+      storage.add(page).should eq Digest::MD5.hexdigest(page.url.to_s)
+      storage.count.should be 1
+      storage.exists?(page).should be true
+    end
+  end
+
+  context 'DELETE' do
+    let(:filled_storage) do
+      storage.add page
+      storage
+    end
+
+    it 'should delete a page' do
+      filled_storage.remove page
+      filled_storage.exists?(page).should be false
+      filled_storage.count.should be 0
+    end
+
+    it 'should empty the storage' do
+      2.times do |i|
+        p = page.to_hash
+        p['url'] = "#{p['url']}/#{i}"
+        storage.add Polipus::Page.from_hash(p)
+      end
+      filled_storage.count.should be 3
+      filled_storage.clear
+      filled_storage.count.should be 0
+    end
+
+  end
+
+  context 'UPDATE' do
+    let(:filled_storage) do
+      storage.add page
+      storage
+    end
+
+    it 'should update a page' do
+      # Second add of the same uuid must not raise (ON DUPLICATE KEY path).
+      filled_storage.add page
+    end
+  end
+
+  context 'SELECT' do
+    let(:filled_storage) do
+      storage.add page
+      storage
+    end
+
+    it 'should fetch a page' do
+      p = filled_storage.get page
+      expect(p).to_not be nil
+      expect(p).to be_a Polipus::Page
+      expect(p.url.to_s).to eq 'http://www.google.com/'
+      expect(p.links.count).to be 2
+      expect(p.headers['content-type']).to eq ['text/html']
+      expect(p.fetched_at).to be > 0
+    end
+
+  end
+
+  context 'CURSOR' do
+    it 'should iterate over pages' do
+      10.times do |i|
+        p = page.to_hash
+        p['url'] = "#{p['url']}/#{i}"
+        storage.add Polipus::Page.from_hash(p)
+      end
+      storage.count.should be 10
+      i = 0
+      storage.each { i += 1 }
+      expect(i).to be 10
+    end
+  end
+
+end

+ 18 - 0
polipus-storage-mysql/spec/spec_helper.rb

@@ -0,0 +1,18 @@
+# coding: utf-8
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  # RSpec 2.x option: treat bare symbols (e.g. `:focus`) as `focus: true`
+  # metadata; removed in RSpec 3, where it is the default behavior.
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+end

+ 17 - 0
polipus-storage-s3/.gitignore

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

+ 4 - 0
polipus-storage-s3/Gemfile

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in polipus-storage-s3.gemspec
+gemspec

+ 22 - 0
polipus-storage-s3/LICENSE.txt

@@ -0,0 +1,22 @@
+Copyright (c) 2014 Francesco Laurita
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 29 - 0
polipus-storage-s3/README.md

@@ -0,0 +1,29 @@
+# Polipus::Storage::S3
+
+An Amazon S3 storage backend for the Polipus web crawler: crawled pages are persisted to an S3 bucket.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'polipus-storage-s3'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install polipus-storage-s3
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it ( http://github.com/<my-github-username>/polipus-storage-s3/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

+ 2 - 0
polipus-storage-s3/Rakefile

@@ -0,0 +1,2 @@
+# coding: utf-8
+require 'bundler/gem_tasks'

+ 0 - 0
polipus-storage-s3/lib/polipus/storage/s3_store.rb


+ 24 - 0
polipus-storage-s3/polipus-storage-s3.gemspec

@@ -0,0 +1,24 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+Gem::Specification.new do |spec|
+  spec.name          = 'polipus-storage-s3'
+  spec.version       = '0.0.1'
+  spec.authors       = ['Francesco Laurita']
+  spec.email         = ['francesco.laurita@gmail.com']
+  spec.summary       = %q(TODO: Write a short summary. Required.)
+  spec.description   = %q(TODO: Write a longer description. Optional.)
+  spec.homepage      = ''
+  spec.license       = 'MIT'
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(/^bin\//) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(/^(test|spec|features)\//)
+  spec.require_paths = ['lib']
+
+  spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
+  spec.add_runtime_dependency 'aws-s3', '~> 0.6', '>= 0.6.3'
+
+  spec.add_development_dependency 'rake'
+end

+ 18 - 0
polipus-storage-s3/spec/spec_helper.rb

@@ -0,0 +1,18 @@
+# coding: utf-8
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  # NOTE(review): this option is the default in — and was removed from —
+  # RSpec 3; drop this line if the suite is ever upgraded past RSpec 2.x.
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  # When an inclusion filter (e.g. :focus) matches nothing, run everything.
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+end