Jun 12, 2019
With so many proxy website URLs all over the place, it's difficult to tell which ones actually have new proxies posted or whether you're just getting the same old proxies that clutter up your list and waste testing time. So I wrote a spider that scrapes proxies off of the URLs and compares the first 15 results from each against the previous run to see how different they are. Easy peasy.
I omitted the spider settings, the Request func, and the callback func to keep it compact (a rough sketch of those omitted pieces follows the snippet):
from scrapy import Spider
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from difflib import SequenceMatcher
import threading
import re
import csv

IPPortPatternGlobal = re.compile(
    r'(?P<ip>(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?))'  # noqa
    r'(?=.*?(?:(?:(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?))|(?P<port>\d{2,5})))',  # noqa
    flags=re.DOTALL,
)

file_name = 'scout_results'
lock = threading.Lock()
threads = []
pdata = {}

# Load the previous run's results so this run has something to compare against
try:
    with open(f"./data/{file_name}.csv") as file:
        results = csv.DictReader(file, delimiter=',')
        # DictReader consumes the header row itself, so the rows start with data
        for row in results:
            try:
                if int(row["count"]) > 0:
                    pdata[row['url']] = {'first_15': row['first_15'], 'count': row['count']}
            except Exception as e:
                print(f'Error: {e}')
except Exception:
    # e.g. the first run, when the csv doesn't exist yet
    pass


class SingleSpider(Spider):

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        global file_name
        self.new_pdata = open(f"./data/{file_name}.csv", "w+")
        self.new_pdata.write('url,first_15,count,ip_diff,c_diff\n')

    def thread_compare(self, data):
        with lock:
            global pdata
            url = data[0].strip()
            f_15 = str(data[1]).strip()
            count = str(data[2]).strip()
            try:
                ip_diff = str(self.compare(f_15, pdata[url]['first_15']))
                count_diff = str(abs(int(count) - int(pdata[url]['count'])))
                print(f'{url} - ip: {ip_diff} count: {count_diff}')
            except Exception as e:
                ip_diff = 'empty'
                count_diff = 'empty'
                print(f'Nothing to compare: {e}')
            self.new_pdata.write(f'{url},{f_15},{count},{ip_diff},{count_diff}\n')

    @staticmethod
    def compare(block1, block2):
        # Similarity ratio between the two "first 15" blocks (1.0 means identical)
        s = SequenceMatcher(lambda x: x in "\n", block1, block2)
        return s.quick_ratio()

    def spider_closed(self, spider):
        self.new_pdata.close()
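For context, here's a rough sketch of the kind of start_requests / callback I left out. The proxy_urls.txt filename, the parse() name, and the threading hand-off are my assumptions for illustration, not the actual omitted code:

from scrapy import Request  # plus the imports from the snippet above

class SingleSpider(Spider):
    # ... __init__, thread_compare, compare, spider_closed exactly as above ...

    def start_requests(self):
        # proxy_urls.txt is a made-up filename standing in for the list of proxy sites
        with open('./data/proxy_urls.txt') as urls:
            for url in urls:
                yield Request(url.strip(), callback=self.parse)

    def parse(self, response):
        # Pull every ip out of the page, keep the first 15 as the comparison block,
        # and hand the result off to thread_compare() in its own thread
        matches = IPPortPatternGlobal.findall(response.text)
        ips = [m[0] for m in matches]
        first_15 = '\n'.join(ips[:15])
        t = threading.Thread(target=self.thread_compare,
                             args=((response.url, first_15, len(ips)),))
        threads.append(t)
        t.start()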
Jun 11, 2019
I've been coding again and just remembered how well this website works for keeping track of cool tricks I learn. Sometimes it's really hard to find simple, generic examples that teach the fundamentals. I needed to write to a file without opening the text document 1,000 times, and I finally found a really clean example that helped me understand the pieces.
Edit: ThreadPool is a lot easier, and you can thread inside a loop:
from multiprocessing.pool import ThreadPool as Pool

threads = 100
p = Pool(threads)
p.map(function, list)  # runs function on every item of list across 100 worker threads
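As a more concrete version of that, here's a minimal sketch; the scrape() function and the example.com URLs are just placeholders I made up:

from multiprocessing.pool import ThreadPool as Pool

def scrape(url):
    # stand-in for whatever per-item work needs doing
    return f'fetched {url}'

urls = [f'http://example.com/page/{i}' for i in range(10)]

with Pool(10) as p:                  # ThreadPool works as a context manager
    results = p.map(scrape, urls)    # blocks until every item is processed, keeps order

print(results[:3])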
More complicated version:
import threading

lock = threading.Lock()

def thread_test(num):
    phrase = "I am number " + str(num)
    with lock:                  # only one thread prints/writes at a time
        print(phrase)
        f.write(phrase + "\n")

threads = []
f = open("text.txt", 'w')

for i in range(100):
    t = threading.Thread(target=thread_test, args=(i,))
    threads.append(t)
    t.start()

# busy-wait until only the main thread is left, then close the file
while threading.activeCount() > 1:
    pass
else:
    f.close()
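A note on that last part: instead of spinning on threading.activeCount(), you can join the threads you collected. This is my own sketch of the same idea, passing the file handle in rather than using a global:

import threading

lock = threading.Lock()

def thread_test(num, f):
    with lock:
        f.write(f"I am number {num}\n")

threads = []
with open("text.txt", "w") as f:
    for i in range(100):
        t = threading.Thread(target=thread_test, args=(i, f))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()    # wait for every worker before the with-block closes the file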
Close something on Scrapy spider close without using a pipeline:
from scrapy import signals
from scrapy.spiders import CrawlSpider
from scrapy.xlib.pydispatch import dispatcher

class MySpider(CrawlSpider):

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # second param is the instance of the spider about to be closed
        pass  # close files / connections here
Instead of using an if time or if count check to activate something, I found a decorator that makes sure the function only runs once:
def run_once(f):
    def wrapper(*args, **kwargs):
        if not wrapper.has_run:
            wrapper.has_run = True
            return f(*args, **kwargs)
    wrapper.has_run = False
    return wrapper

@run_once
def my_function(foo, bar):
    return foo + bar
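A quick usage note (my own example, not from the original snippet): only the first call actually executes; later calls fall through and return None.

print(my_function(1, 2))   # 3 -> runs and flips has_run to True
print(my_function(3, 4))   # None -> the wrapped function body is skipped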
You can also resize the terminal window from inside your code with an escape sequence:
import sys

# xterm window-manipulation sequence: resize to 46 rows x 54 columns
sys.stdout.write("\x1b[8;{rows};{cols}t".format(rows=46, cols=54))
I got stuck for a while trying to get my repository to let me log in without creating an SSH key (super annoying imo), and it turned out I had added the SSH URL as the origin URL and needed to reset it to the HTTPS one:
Change the origin URL:
git remote set-url origin <url-with-your-username>
Combine mp3 files on Linux:
ls *.mp3
sudo apt-get install mp3wrap
mp3wrap output.mp3 *.mp3
Regex is always better than splitting a bunch of times and making the code messy. Plus, it's a lot easier to pick the code up later and figure out what's going on. So I decided to take my regex to the next level and start labeling groups (I'm even going to give it its very own tag :3):
pat = r'(?<=\,\"searchResults\"\:\{)(?<list_results>.*)(?=\,\"resultsHash\"\:)'
m = re.match(pat, url)
if m:
self.domain = m.group('list_results')
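Here's a quick standalone check of that pattern against a made-up blob (the JSON string is invented purely to show the named group being pulled out):

import re

pat = r'(?<=,"searchResults":\{)(?P<list_results>.*)(?=,"resultsHash":)'
sample = '{"foo":1,"searchResults":{"listResults":[1,2,3]},"resultsHash":"abc"}'

m = re.search(pat, sample)
if m:
    print(m.group('list_results'))   # "listResults":[1,2,3]}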
Jan 31, 2019
You can catch 404s and connection errors by passing errback= to the scrapy.Request object. From there I just take the failed proxy from the request meta and add it to a list of failed proxies inside the ProxyEngine class. If a proxy shows up in the failed list N times, it can be removed with the ProxyEngine.remove_bad() method. I also discovered that passing download_timeout inside the request meta works a lot better than setting it in the Spider's global settings. Now the spider doesn't hang on slow or broken proxies and is much, much faster.
Next I plan to refactor the ProxyEngine data to serialize attempts so that I can catch proxies that have been banned by one domain but not others. Also, I need to feed bad_proxies back into the request generator after they've been down for a set amount of time, and save all of the proxy data to a database. Here's the code:
Proxy Engine
class ProxyEngine:

    def __init__(self, limit=3):
        self.proxy_list = []
        self.bad_proxies = []
        self.good_proxies = []
        self.failed_proxies = []
        self.limit = limit          # failures allowed before a proxy is considered bad

    def get_new(self, file='./proxies.txt'):
        # Pull proxies from a text file (one ip:port per line) into proxy_list
        new_proxies = []
        with open(file, 'r') as f:
            for line in f:
                new_proxies.append(f'https://{line.strip()}')
        return [self.proxy_list.append(x) for x in new_proxies
                if x not in self.proxy_list and x not in self.bad_proxies]

    def remove_bad(self):
        # Move anything that failed `limit` or more times into bad_proxies
        for proxy in self.proxy_list:
            if self.failed_proxies.count(proxy) >= self.limit:
                self.bad_proxies.append(proxy)
        # iterate over a copy so removing items doesn't skip elements
        return [self.proxy_list.remove(x) for x in list(self.proxy_list) if x in self.bad_proxies]
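Roughly how the class gets used on its own (a sketch: it assumes 10.0.0.1:8080 is one of the lines in proxies.txt, and the failure counts are invented just to show remove_bad()):

prox = ProxyEngine(limit=3)
prox.get_new()     # loads ./proxies.txt and prefixes each line with https://

# pretend one of the loaded proxies timed out three times during a crawl
prox.failed_proxies += ['https://10.0.0.1:8080'] * 3

prox.remove_bad()  # anything at or over the limit moves into bad_proxies
print(prox.bad_proxies)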
Proxy Spider
import scrapy
from scrapy import Spider
from scrapy.spidermiddlewares.httperror import HttpError
# failure types checked in get_error()
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError


class ProxyTest(Spider):
    name = 'proxy_test'

    custom_settings = {
        'ITEM_PIPELINES': {
            '__main__.ProxyPipeline': 400
        },
        'CONCURRENT_REQUESTS_PER_IP': 2,
    }

    def __init__(self):
        self.prox = ProxyEngine(limit=20)

    def start_requests(self):
        self.prox.get_new()
        for proxy in self.prox.proxy_list:
            request = scrapy.Request("https://dashwood.net/post/python-3-new-string-formatting/456ft",
                                     callback=self.get_title, errback=self.get_error, dont_filter=True)
            request.meta['proxy'] = proxy
            request.meta['dont_retry'] = True
            request.meta['download_timeout'] = 5
            yield request

    def get_title(self, response):
        print(response.status)
        print('*' * 15)

    def get_error(self, failure):
        if failure.check(HttpError):
            response = failure.value.response
            print("HttpError occurred", response.status)
            print('*' * 15)
        elif failure.check(DNSLookupError):
            request = failure.request
            print("DNSLookupError occurred on", request.url)
            print('*' * 15)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.prox.failed_proxies.append(request.meta["proxy"])
            print("TimeoutError occurred", request.meta)
            print('*' * 15)
        else:
            request = failure.request
            print("Other Error", request.meta)
            print(f'Proxy: {request.meta["proxy"]}')
            self.prox.failed_proxies.append(request.meta["proxy"])
            print('Failed:', self.prox.failed_proxies)
            print('*' * 15)
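To actually kick this off as a stand-alone script, I'd run it with CrawlerProcess, roughly like this (assuming the ProxyPipeline referenced in custom_settings is defined in the same file):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(ProxyTest)
process.start()    # blocks until the crawl is finished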
Jan 07, 2019
I've been meaning to do this for a while now, but honestly it's been really difficult to find reference material to copy off of. Fortunately, today I found some really good repositories with almost exactly what I was looking for. Then, after I got it working, I combed the Scrapy docs very slowly, made sure I understood all of the item loader functions, and added simple examples / documentation for most of the features.
One Stand-alone Scrapy Script to Rule Them All
Basically what I wanted was a minimal, clean Scrapy script that I could use in other projects without being tied down to the scrapy-cli project crap. I actually feel like I have full control of my script and have been taking great care to organize it correctly. Also, using item loaders / processors is really cool and should open the door to solving issues really cleanly.
Note: I added a few interesting features to showcase some of the functionality of item loaders.
#! /usr/local/bin/python3
# -*- coding: utf-8 -*-

from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from scrapy import Spider, Item, Field
from scrapy.settings import Settings

# Originally built off of:
# https://gist.github.com/alecxe/fc1527d6d9492b59c610


def extract_tag(self, values):
    # Custom function for an Item Loader processor
    for value in values:
        yield value[5:-1]
class DefaultAwareItem(Item):
    # Converts field "default" meta into a default value fallback

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use python's built-in setdefault() function on all items
        for field_name, field_metadata in self.fields.items():
            if not field_metadata.get('default'):
                self.setdefault(field_name, 'No default set')
            else:
                self.setdefault(field_name, field_metadata.get('default'))
# Item Fields
class CustomItem(DefaultAwareItem):
    '''
    Input / Output processors can also be declared in the field meta, e.g.:

        name = scrapy.Field(
            input_processor=MapCompose(remove_tags),
            output_processor=Join(),
        )
    '''
    title = Field(default="No Title")
    link = Field(default="No Links")
    desc = Field()
    tag = Field(default="No Tags")
class CustomItemLoader(ItemLoader):
    '''
    Item Loader declaration: input and output processors, functions
    https://doc.scrapy.org/en/latest/topics/loaders.html#module-scrapy.loader.processors

    Processors (any functions applied to items here):
        Identity()   - leaves values as-is
        TakeFirst()  - takes the first non-null value
        Join()       - basically equivalent to u' '.join
        Compose()    - applies a list of functions one at a time **accepts loader_context
        MapCompose() - applies a list of functions to a list of objects **accepts loader_context;
                       the first function is applied to all objects, then the altered objects
                       are passed to the next function, and so on

    https://doc.scrapy.org/en/latest/topics/loaders.html#declaring-input-and-output-processors
        _in processors are applied to extractions as soon as they are received
        _out processors are applied to the collected data once loader.load_item() is yielded
        single items are always converted to iterables
        custom processor functions must receive self and values
    '''
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    desc_out = Join()

    tag_in = extract_tag  # function assigned as a class variable
    tag_out = Join(', ')
# Define a pipeline
class WriterPipeline(object):

    def __init__(self):
        self.file = open('items.txt', 'w')

    def process_item(self, item, spider):
        self.file.write(item['title'] + '\n')
        self.file.write(item['link'] + '\n')
        self.file.write(item['desc'] + '\n')
        self.file.write(item['tag'] + '\n\n')
        return item
# Define a spider
class CustomSpider(Spider):
    name = 'single_spider'
    allowed_domains = ['dashwood.net']
    start_urls = ['https://dashwood.net/']

    def parse(self, response):
        for sel in response.xpath('//article'):
            loader = CustomItemLoader(
                CustomItem(), selector=sel, response=response)
            loader.add_xpath('title', './/h2/a/text()')
            loader.add_xpath('link', './/a/@href')
            loader.add_xpath('desc', './/p/text()')
            loader.add_xpath('tag', './/a[@class="tag"]//@href')
            yield loader.load_item()
# Declare some settings / pipelines
settings = Settings({
    # pipelines start with the project/module name, so replace it with __main__
    'ITEM_PIPELINES': {
        '__main__.WriterPipeline': 100,
    },
    'DEFAULT_REQUEST_HEADERS': {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch',
        'accept-language': 'en-US,en;q=0.8',
        'upgrade-insecure-requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    }
})

process = CrawlerProcess(settings)

# you can run 30 of these at once if you want, e.g.:
# process.crawl(CustomSpider)
# process.crawl(CustomSpider)  etc. * 30
process.crawl(CustomSpider)
process.start()
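As a quick aside, here's how those loader processors behave on their own, outside of any spider; this is just my own illustration of the docstring above, not part of the original script:

from scrapy.loader.processors import MapCompose, TakeFirst, Join

clean = MapCompose(str.strip, str.title)
print(clean(['  hello  ', '  world  ']))           # ['Hello', 'World'] - each function hits every value
print(TakeFirst()(['', None, 'first', 'second']))  # 'first' - skips empty / null values
print(Join(', ')(['a', 'b', 'c']))                 # 'a, b, c'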