Commit 82746a2b authored by neodarz's avatar neodarz

Initial commit

parents
Pipeline #634 failed with stages
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
Simple search engine — searching is not implemented yet; only the crawler exists for the moment.
# Crawling
For now there is an example spider for the neodarz website.
To test it, just run:
```
scrapy crawl scrape -o out.json
```
# -*- coding: utf-8 -*-
import scrapy
class NeodarznetItem(scrapy.Item):
    """Item container for pages scraped from neodarz.net.

    No fields are declared yet: the spider currently yields plain dicts
    (url / title / content), so this class is a placeholder for future
    ``scrapy.Field`` definitions.
    """
    pass
# -*- coding: utf-8 -*-
class NeodarznetPipeline(object):
    """Pass-through item pipeline for the neodarznet crawler.

    Scrapy invokes ``process_item`` on every item a spider yields; the
    original code named this hook ``process_time``, which Scrapy never
    calls, so items silently bypassed the pipeline.
    """

    def process_item(self, item, spider):
        """Return the item unchanged (hook point for future processing).

        :param item: the scraped item (dict or Item) emitted by the spider
        :param spider: the spider that produced the item
        :returns: the item, unmodified
        """
        return item

    # Backward-compatible alias so any caller of the old (misspelled)
    # method name keeps working.
    process_time = process_item
# -*- coding: utf-8 -*-
# Scrapy project settings for the neodarznet crawler.
# Bot name, used in logs and the default User-Agent string.
BOT_NAME = 'neodarznet'
# Module(s) where Scrapy looks for spider classes, and where
# `scrapy genspider` places newly generated spiders.
SPIDER_MODULES = ['crawler.neodarznet.spiders']
NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
# Honor robots.txt directives on crawled sites.
ROBOTSTXT_OBEY = True
# Maximum crawl depth; 0 means unlimited.
DEPTH_LIMIT = 0
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import Selector
class ScrapSpider(CrawlSpider):
    """Crawl https://neodarz.net/ and yield, for each page, its URL,
    <title> text and the concatenated text of ``<div class="bodya">``.
    """
    name = "scrape"
    # FIX: Scrapy's off-site middleware reads `allowed_domains`; the
    # original attribute was misspelled `allow_domains` and silently
    # ignored, so nothing constrained the crawl.
    allowed_domains = ['neodarz.net']
    start_urls = [
        'https://neodarz.net/',
    ]
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True,
                allow_domains="neodarz.net",
                # Raw string so the `\.` escapes reach the regex engine
                # intact (avoids invalid-escape warnings). Excludes
                # subdomains of neodarz.net from the crawl.
                deny=r".*\.neodarz\.net.*"
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    def start_requests(self):
        """Seed the crawl; `dont_filter` guarantees the start URLs are
        fetched even if the dupe filter has already seen them."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        """Yield one record per crawled page.

        :param response: the downloaded page
        :returns: generator of dicts with `url`, `title` and `content`
        """
        sel = Selector(response)
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
            # FIX: Selector.xpath replaces the long-deprecated (and since
            # removed) Selector.select API; same XPath, same result.
            'content': ''.join(sel.xpath("//div[@class='bodya']//text()").extract()).strip()
        }
[settings]
default = crawler.neodarznet.settings
[deploy]
project = crawler.neodarznet
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment