lang/py

python proxy scraping

C/H 2019. 1. 29. 08:30

프록시 이용 스크래핑

파이삭스 PySocks

pip install pysocks
# pip install pysocks
import socks
import socket
from urllib.request import urlopen

try:
    # Public SOCKS proxy lists: http://socks-proxy.net/
    # Signature reminder:
    #   set_proxy(proxy_type, addr[, port[, rdns[, username[, password]]]])
    socks.set_default_proxy(socks.SOCKS4, "36.91.54.151", 33474)
    # Replace the stdlib socket class so that sockets created from here on
    # (including the ones urllib opens) are PySocks sockets that use the
    # default proxy configured above.
    socket.socket = socks.socksocket

    # Fetch through the proxy. The context manager closes the HTTP response
    # (and its underlying proxied connection) even if read() raises.
    # NOTE(review): icanhazip.com presumably echoes the caller's public IP,
    # so the printed value should be the proxy's address - confirm.
    with urlopen('http://icanhazip.com') as response:
        rs = response.read()
    print( rs )
except Exception as err:
    # Demo script: report any proxy/network failure instead of crashing.
    print(repr(err))

셀레니움

from selenium import webdriver

# Stays None until a browser has actually been launched, so the cleanup in
# ``finally`` can tell whether there is anything to shut down.
driver = None
try:
    # SOCKS proxy endpoint; 'type' is the SOCKS protocol version (4 or 5).
    proxy = {
        "ip": '36.91.54.151',
        "port": 33474,
        'type': 4,
    }
    # Pre-built proxy arguments in the form each browser expects.
    proxy['phantomjs'] = ['--proxy='+proxy['ip']+':'+str(proxy['port']), '--proxy-type=socks'+str(proxy['type'])]
    proxy['chrome'] = '--proxy-server=socks'+str(proxy['type'])+'://'+proxy['ip']+':'+str(proxy['port'])
    # Browser selector: 'phantom' or 'firefox'; any other value (such as the
    # '-firefox' used here) falls through to headless Chrome.
    browser = '-firefox'

    if browser == 'phantom':
        # PhantomJS has not been developed since 2018.
        driver = webdriver.PhantomJS(executable_path="C:\\User\\uncao\\scoop\\shim\\phantomjs.exe", service_args=proxy['phantomjs'])
    elif browser == 'firefox':
        # https://www.programcreek.com/python/example/100026/selenium.webdriver.FirefoxProfile
        options = webdriver.FirefoxOptions()
        options.headless = True
        profile = webdriver.FirefoxProfile()
        # network.proxy.type = 1 selects manual proxy configuration.
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", proxy.get('ip'))
        profile.set_preference("network.proxy.socks_port", proxy.get('port'))
        profile.set_preference("network.proxy.socks_version", proxy.get('type'))
        profile.update_preferences()
        driver = webdriver.Firefox(options=options, firefox_profile=profile)
    else:
        # https://beomi.github.io/2017/09/28/HowToMakeWebCrawler-Headless-Chrome/
        options = webdriver.ChromeOptions()
        options.add_argument(proxy['chrome'])
        options.add_argument('headless')
        options.add_argument('window-size=1920x1080')
        options.add_argument("disable-gpu")
        options.add_argument("lang=ko_KR")  # Korean!
        # Presumably overrides the default headless user agent so the browser
        # looks like a regular desktop Chrome - confirm.
        options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        driver = webdriver.Chrome(options=options)

        # NOTE(review): these scripts run in the page that is open before
        # driver.get(); the overrides likely do not survive the navigation
        # below - confirm whether they need to be injected per document.
        # Report a non-empty plugin list.
        driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
        # Override the navigator.languages property.
        driver.execute_script("Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})")
        # Spoof the WebGL vendor/renderer strings (parameters 37445/37446) as
        # a GTX 980 Ti. The original getParameter lives on the prototype and
        # must be invoked with the context as `this`, otherwise every other
        # parameter lookup throws.
        driver.execute_script("const getParameter = WebGLRenderingContext.prototype.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter.call(this, parameter);};")

    driver.implicitly_wait(3)
    driver.get('http://icanhazip.com')
    print(driver.page_source)
except Exception as err:
    # Demo script: report any driver/proxy failure instead of crashing.
    print(repr(err))
finally:
    # Always end the WebDriver session, including on failure; quit() shuts
    # down the browser and its driver process, not just the current window.
    if driver is not None:
        driver.quit()
참고: 『파이썬으로 웹 크롤러 만들기』(한빛미디어), 14.2.1 파이삭스, 250p


반응형

'lang > py' 카테고리의 다른 글

Book 파이썬답게 코딩하기 - 철학과 개념  (0) 2020.01.08
Udemy Download  (0) 2019.04.08
library Tesseract - OCR test  (0) 2019.01.25
library Tesseract - OCR  (0) 2019.01.24
library Pillow - thumbnail create  (0) 2019.01.23