# -*- coding: utf-8 -*-
# Scrapy settings for the get_contact_info project.
#
# Example run:
#   python3 -m scrapy runspider spiders/mitspider.py -o emails.json -L INFO
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#   https://docs.scrapy.org/en/latest/topics/settings.html
#   https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'get_contact_info'

SPIDER_MODULES = ['get_contact_info.spiders']
NEWSPIDER_MODULE = 'get_contact_info.spiders'

# Hand every response to the spider regardless of its HTTP status code, so
# spider callbacks must be prepared to receive error responses too.
HTTPERROR_ALLOW_ALL = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'get_contact_info (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed by this project; set to True to respect them.
ROBOTSTXT_OBEY = False

# Maximum link depth that will be crawled for any site.
DEPTH_LIMIT = 20

# Crawl order (left at the Scrapy default while the lines below stay
# commented out):
#   - zero (default): no priority adjustment is made from depth;
#   - a positive value decreases the priority, i.e. higher-depth requests are
#     processed later; this is commonly used for breadth-first crawls (BFO);
#   - a negative value increases the priority, i.e. higher-depth requests are
#     processed sooner (DFO).
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Delay, in seconds, between requests to the same website (default: 0).
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also the AutoThrottle settings below: with AutoThrottle enabled, this
# value acts as the lower bound for the delay it computes.
DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of the two limits below.
# NOTE: per the Scrapy settings documentation, a non-zero
# CONCURRENT_REQUESTS_PER_IP takes precedence, so the per-domain limit is
# ignored and both the concurrency limit and the delay are applied per IP.
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are left enabled (the Scrapy default); set to False to disable them.
COOKIES_ENABLED = True

# The Telnet console is left enabled (the Scrapy default); set to False to
# disable it.
TELNETCONSOLE_ENABLED = True

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'get_contact_info.middlewares.GetContactInfoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'get_contact_info.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'get_contact_info.pipelines.GetContactInfoPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Show throttling stats for every response received (verbose; useful while
# tuning the values above).
AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
# Cached responses older than this many seconds are downloaded again.
# NOTE(review): 30 seconds is very short for a crawl cache -- confirm that
# this is intentional.
HTTPCACHE_EXPIRATION_SECS = 30
HTTPCACHE_DIR = 'httpcache'
# No status codes are excluded from the cache, so error responses are
# cached as well.
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'