BrauchenPython

Python-Programme
Anonymous
 Brauchen

Post by Anonymous »

Ich scrape täglich Daten mit Python Playwright. Auf meinem lokalen Windows-10-Rechner hatte ich zunächst einige Probleme, aber mit Browserforge + Residential Smart Proxy (für Fingerprints und legitime IPs) habe ich es zum Laufen gebracht. Dieses Setup hat perfekt funktioniert, allerdings nur lokal. Ich verwende AWS Batch mit Fargate, um die Skripte auszuführen, und dort bricht alles zusammen.
Nachdem ich in der Cloud auf 403-Fehler gestoßen bin, habe ich Alternativen wie Camoufox und Patchright ausprobiert. Sie funktionieren im Headed-Modus, aber sobald ich sie auf AWS ausführe, erscheint ein Captcha. Das Captcha verlangt, dass man einen Button gedrückt hält, und selbst wenn ich es manuell löse, bekomme ich danach immer noch 403-Fehler. Nutzlos.
Inzwischen sind mir die Ideen ausgegangen. Hat es jemand geschafft, EasyPara.fr zuverlässig von AWS aus (insbesondere mit Playwright) zu scrapen? Gibt es Tricks, Setups oder Tools, die ich vielleicht übersehen habe? Ich habe bereits mehrere andere Online-Händler mit Cloudflare und erweitertem Captcha-Schutz gescrapt (Eva.ua, walmart.com.mx, Chewy.com usw.).

Code: Select all

from patchright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError, Page, BrowserContext, Request, Error as PlaywrightError
from pathlib import Path
import os
from io import BytesIO
from datetime import datetime
import re as reg
import random
import shutil
import asyncio
from oxymouse import OxyMouse

from logs.logs import module_logger

class BrowserObject:
    """Bundle a page with its browser context and per-page bookkeeping.

    Every attribute is always defined, even when no page is supplied, so
    callers can test ``obj.page is None`` instead of risking an
    ``AttributeError`` on a half-initialised instance.
    """

    def __init__(self, page: "Page | None") -> None:
        """Wrap *page* and reset the bookkeeping fields.

        Args:
            page: The page to wrap. When ``None``, ``page`` and
                ``browser_context`` are both set to ``None``.
        """
        # The context is read from the page itself so the two stay in sync.
        self.browser_context = page.context if page is not None else None
        self.page = page
        # Starts as True for every freshly wrapped page.
        # NOTE(review): who flips this flag is not visible here - confirm.
        self.initial_call: bool = True
        # Starts at zero for every freshly wrapped page.
        # NOTE(review): presumably counts requests for rotation - confirm.
        self.count_requests: int = 0

class AsyncPatchRightDriver:
def __init__(
    self,
    async_playwright: async_playwright,
    args: list = None,
    headless: bool = False,
    proxy: dict = None,
    route_block: list = None,
    resource_block: list = None,
    rotate_every: int = 10,
    user_data_dir: str = os.path.join(Path.home(), ".patchright_user_data"),  # Added for persistent context
):
    """Store the launch configuration; no browser is started here.

    Args:
        async_playwright: Started Patchright/Playwright handle; its
            ``chromium`` attribute is used as the browser type.
        args: Extra command-line arguments forwarded to the browser launch.
        headless: Forwarded to the browser launch.
        proxy: Proxy settings forwarded to the browser launch.
        route_block: Passed to ``block_network_files`` on navigation.
            Defaults to a fresh empty list.
        resource_block: Passed to ``block_resources`` on navigation.
            Defaults to a fresh empty list.
        rotate_every: Stored on the instance.
            NOTE(review): presumably the number of requests between
            page rotations - confirm against ``update_page_count``.
        user_data_dir: Profile directory for the persistent context.
    """
    # ``None`` sentinels replace the former ``[]`` defaults: a list literal
    # in the signature is created once and shared by every instance.
    self.route_block = route_block if route_block is not None else []
    self.resource_block = resource_block if resource_block is not None else []
    self.async_playwright = async_playwright
    self.browser_name_literal = "chromium"  # Hardcoded to chromium because it is the only one supported by Patchright
    self.browser_name = self.async_playwright.chromium
    self.headless = headless
    self.args = args
    self.proxy = proxy
    self.rotate_every = rotate_every
    self.logger = module_logger
    # NOTE(review): S3Utils is not imported in the visible part of this
    # file - confirm the import exists, otherwise this raises NameError.
    self.s3_utils = S3Utils(logger=self.logger)
    self.user_data_dir = user_data_dir
    # Checked before the request-logging hook is attached on navigation.
    self.headers_printed = False

async def async_init(self):
    """Finish construction asynchronously and return the driver itself.

    Clears the user data first, then opens the first page and stores its
    wrapper on ``self.browser_object``.
    """
    await self.clear_user_data()
    first_page_wrapper = await self.create_new_page_with_context()
    self.browser_object = first_page_wrapper
    return self

def __await__(self):
    """Make the driver awaitable: ``await driver`` runs ``async_init``."""
    init_coroutine = self.async_init()
    return init_coroutine.__await__()

async def emulate_human_behavior(self, browser_object: BrowserObject = None):
    """Run a short, randomised scroll-and-mouse routine on the page.

    The routine scrolls down, plays one to three OxyMouse movement
    sequences, then occasionally scrolls back up and/or further down,
    with small random pauses between the steps.

    Args:
        browser_object: Wrapper whose page is driven. Falls back to
            ``self.browser_object`` when omitted.

    Raises:
        Exception: Any error is logged as a warning and re-raised.
    """
    target = browser_object or self.browser_object

    async def _scroll_by(delta: int) -> None:
        # Vertical scroll relative to the current position.
        await target.page.evaluate(f"window.scrollBy(0, {delta})")

    async def _pause(low: float, high: float) -> None:
        await asyncio.sleep(random.uniform(low, high))

    try:
        # Ask the page for the size of the window it is rendered in.
        viewport = await target.page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")

        mouse = OxyMouse(algorithm="gaussian")
        self.logger.info("Initialized OxyMouse with gaussian algorithm")
        await _pause(0.05, 0.15)

        # Step 1: opening scroll.
        first_scroll = random.randint(100, 300)
        await _scroll_by(first_scroll)
        self.logger.info(f"Performed initial scroll by {first_scroll} pixels")
        await _pause(0.2, 0.5)

        # Step 2: between one and three bursts of mouse movement.
        num_movements = random.randint(1, 3)
        self.logger.info(f"Generating {num_movements} mouse movement sequences")
        # The vertical range handed to OxyMouse is capped at 600 px.
        height_cap = min(600, first_scroll + 200)
        for sequence_no in range(1, num_movements + 1):
            path = mouse.generate_random_coordinates(
                viewport_width=viewport["width"],
                viewport_height=height_cap,
            )
            # Only the first five points of each generated path are used.
            for x, y in path[:5]:
                await target.page.mouse.move(x, y)
                await _pause(0.03, 0.06)

            self.logger.info(f"Completed mouse movement sequence {sequence_no}")
            await _pause(0.15, 0.35)

        # Step 3: sometimes scroll a little way back up.
        if random.random() > 0.8:
            scroll_back = random.randint(30, 100)
            await _scroll_by(-scroll_back)
            self.logger.info(f"Scrolled back up by {scroll_back} pixels")
            await _pause(0.15, 0.3)

        # Step 4: sometimes continue scrolling down.
        if random.random() > 0.6:
            scroll_down = random.randint(100, 400)
            await _scroll_by(scroll_down)
            self.logger.info(f"Scrolled further down by {scroll_down} pixels")
            await _pause(0.15, 0.3)

        self.logger.info("Human-like behavior emulation completed with OxyMouse")
    except Exception as e:
        self.logger.warning(f"Error during human behavior emulation with OxyMouse: {e}")
        raise

async def create_new_page_with_context(self, browser_object: BrowserObject = None) -> BrowserObject:
    """Open a new page in the persistent context, launching it if needed.

    The persistent context is launched once, the first time this method
    runs, and stored on ``self.browser_context`` for later calls.

    Args:
        browser_object: Existing wrapper to re-point at the new page.
            When omitted, a new ``BrowserObject`` is created.

    Returns:
        The wrapper holding the new page. This is the same object that
        was passed in, if any, with its counters reset.
    """
    if not hasattr(self, "browser_context"):
        launch = self.browser_name.launch_persistent_context
        self.browser_context = await launch(
            user_data_dir=self.user_data_dir,
            channel="chrome",
            headless=self.headless,
            no_viewport=True,
            args=self.args,
            proxy=self.proxy,
        )
        module_logger.info("PatchRight browser context launched successfully")

    fresh_page = await self.browser_context.new_page()

    # No wrapper supplied: build one around the new page.
    if not browser_object:
        return BrowserObject(fresh_page)

    # Re-use the caller's wrapper and reset its per-page state.
    browser_object.page = fresh_page
    browser_object.browser_context = self.browser_context
    browser_object.count_requests = 0
    browser_object.initial_call = True
    return browser_object

async def async_go(self, url, browser_object: BrowserObject = None):
if not browser_object:
browser_object = self.browser_object
await self.update_page_count(browser_object=browser_object)

if self.resource_block:
self.logger.info(f"Blocking resources: {self.resource_block}")
await browser_object.page.route("**/*", lambda route: self.block_resources(route, self.resource_block))
if self.route_block:
self.logger.info(f"Blocking network files: {self.route_block}")
await browser_object.page.route("**/*", lambda route: self.block_network_files(route, self.route_block))
if not self.headers_printed:
browser_object.page.on("request", lambda request: self.log_browser_info(request=request, url=url))

self.logger.info(f"Navigating to {url}")
await self.wait_random(0.5, browser_object=browser_object)

try:
response = await browser_object.page.goto(url, wait_until="load", timeout=60000)
await self.emulate_human_behavior(browser_object)
if response and 400

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post