У меня есть простой сканер/паук Python, который ищет указанный текст на сайте, который я предоставляю. Но на некоторых сайтах ползает нормально по 2-4 сек пока не вылезет ошибка.
Код до сих пор:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import requests, pyquery, urlparse
try:
range = xrange
except NameError:
pass
def crawl(seed, depth, terms):
crawled = set()
uris = set([seed])
for level in range(depth):
new_uris = set()
for uri in uris:
if uri in crawled:
continue
crawled.add(uri)
# Get URI contents
try:
content = requests.get(uri).content
except:
continue
# Look for the terms
found = 0
for term in terms:
if term in content:
found += 1
if found > 0:
yield (uri, found, level + 1)
# Find child URIs, and add them to the new_uris set
dom = pyquery.PyQuery(content)
for anchor in dom('a'):
try:
link = anchor.attrib['href']
except KeyError:
continue
new_uri = urlparse.urljoin(uri, link)
new_uris.add(new_uri)
uris = new_uris
if __name__ == '__main__':
import sys
if len(sys.argv) < 4:
print('usage: ' + sys.argv[0] +
"start_url crawl_depth term1 [term2 [...]]")
print(' ' + sys.argv[0] +
" http://yahoo.com 5 cute 'fluffy kitties'")
raise SystemExit
seed_uri = sys.argv[1]
crawl_depth = int(sys.argv[2])
search_terms = sys.argv[3:]
for uri, count, depth in crawl(seed_uri, crawl_depth, search_terms):
print(uri)
Теперь предположим, что я хочу найти все страницы, у которых есть «requireLazy(» в их источнике. Давайте попробуем это с facebook, если я выполню это:
python crawler.py https://www.facebook.com 4 '<script>requireLazy('
Он будет работать нормально в течение 2-4 секунд, и произойдет эта ошибка:
https://www.facebook.com
https://www.facebook.com/badges/?ref=pf
https://www.facebook.com/appcenter/category/music/?ref=pf
https://www.facebook.com/legal/terms
https://www.facebook.com/
...
Traceback (most recent call last):
File "crawler.py", line 61, in <module>
for uri, count, depth in crawl(seed_uri, crawl_depth, search_terms):
File "crawler.py", line 38, in crawl
dom = pyquery.PyQuery(content)
File "/usr/local/lib/python2.7/dist-packages/pyquery/pyquery.py", line 226, in __init__
elements = fromstring(context, self.parser)
File "/usr/local/lib/python2.7/dist-packages/pyquery/pyquery.py", line 70, in fromstring
result = getattr(lxml.html, meth)(context)
File "/usr/lib/python2.7/dist-packages/lxml/html/__init__.py", line 634, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/usr/lib/python2.7/dist-packages/lxml/html/__init__.py", line 532, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2754, in lxml.etree.fromstring (src/lxml/lxml.etree.c:54631)
File "parser.pxi", line 1578, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:82748)
File "parser.pxi", line 1457, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:81546)
File "parser.pxi", line 965, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:78216)
File "parser.pxi", line 569, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:74472)
File "parser.pxi", line 650, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:75363)
File "parser.pxi", line 599, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:74827) lxml.etree.XMLSyntaxError: line 21: Tag fb:like invalid
Может ли кто-нибудь помочь мне исправить эту ошибку? Спасибо.