Files
trendyol-analiz/backend/scraper.py
furkanyigit34 706f957ba2 fix(scraper): pathModel -x-c{id} suffix + defensive int cast
Ne yaptık:
- queue_worker.py find_leaves: kategori URL'sine -x-c{id} suffix ekledik
  (ör: makyaj-cantasi -> makyaj-cantasi-x-c1110)
- scraper.py fetch_all_products: total degeri str gelirse int'e cast
  ediyoruz (roughTotal "0" string donuyordu, '/' operatoru patliyordu)

Neden yaptık:
- Trendyol Search API path formati degisti, artik suffix'siz cagri 0
  urun donduruyor (ornek test: makyaj-cantasi=0, makyaj-cantasi-x-c1110=15.712)
- Yan etki olarak 'total' int 0 falsy oldugu icin 'or' chain
  roughTotal'a dusuyor, bu da string olarak donuyor ve math.ceil(str/int)
  TypeError firlatiyordu. Tum 35 alt kategori bu yuzden patladi
  (rapor 62 "Makyaj Analizi" sifir urun).
- Fix iki seviyeli: asil cozum suffix (artik dogru data ceker), defansif
  int() cast gelecekte API'nin baska sekilde tutarsiz donmesine karsi
  guvenlik agi.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 16:38:13 +03:00

361 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Trendyol Best Seller Scraper - Backend Integration
Veritabanından kategorileri alıp otomatik çeker
"""
import requests
import json
import time
import math
import os
from typing import Dict, List, Any, Optional
from datetime import datetime
from logging_config import get_logger
log = get_logger("scraper")
class TrendyolScraper:
    """Fetch best-seller products of one category from the Trendyol top-rankings API."""

    API_BASE_URL = "https://apigw.trendyol.com/discovery-sfint-browsing-service/api/top-rankings/top-ranking-contents"

    def __init__(self, category_id: int, page_size: int = 20):
        """
        Args:
            category_id: Trendyol category ID.
            page_size: Products per page; clamped to the range 1..20.
        """
        self.category_id = category_id
        # Cap at 20 as before; the lower bound of 1 keeps the page-count
        # division in calculate_total_pages from dividing by zero.
        self.page_size = max(1, min(page_size, 20))
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "Accept": "application/json",
            "Referer": "https://www.trendyol.com/"
        }

    def fetch_page(self, page: int) -> Optional[Dict[str, Any]]:
        """Fetch a single page.

        Returns:
            The decoded JSON object, or None when the request fails, the
            body is not valid JSON, or the payload is not a JSON object.
        """
        params = {
            "categoryId": self.category_id,
            "rankingType": "bestSeller",
            "webGenderId": 1,
            "page": page,
            "pageSize": self.page_size,
            "channelId": 1,
            "storefrontId": 1,
            "language": "tr",
            "countryCode": "TR"
        }
        try:
            response = requests.get(
                self.API_BASE_URL,
                params=params,
                headers=self.headers,
                timeout=10
            )
            response.raise_for_status()
            payload = response.json()
            if not isinstance(payload, dict):
                # Callers use .get() on the result, so anything else is unusable.
                raise ValueError(f"unexpected JSON payload type: {type(payload).__name__}")
            return payload
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures, which are not a
            # RequestException subclass on older versions of requests.
            log.warning(f"Sayfa {page} error: {e}")
            return None

    @staticmethod
    def _extract_total(data: Optional[Dict[str, Any]]) -> int:
        """Read 'totalCount' from a page payload as a non-negative int (0 if unusable)."""
        if not data or not data.get('isSuccess'):
            return 0
        try:
            return max(int(data.get('totalCount') or 0), 0)
        except (TypeError, ValueError):
            return 0

    def get_total_count(self) -> int:
        """Return the total product count reported by page 1 (0 on failure)."""
        return self._extract_total(self.fetch_page(page=1))

    def calculate_total_pages(self, total_count: int, max_pages: Optional[int] = None) -> int:
        """Compute how many pages to fetch.

        Args:
            total_count: Total number of products.
            max_pages: Optional upper limit; a falsy value means no limit.
        """
        total_pages = math.ceil(total_count / self.page_size)
        # Apply the page limit when one is given
        if max_pages:
            total_pages = min(total_pages, max_pages)
        return total_pages

    def fetch_all_products(self, delay: float = 1.0, max_pages: int = 5) -> List[Dict[str, Any]]:
        """
        Fetch the products page by page.

        Args:
            delay: Seconds to wait between requests.
            max_pages: Maximum number of pages (default: 5 = 100 products).

        Returns:
            List of products; empty when the first page fails or reports
            no products.
        """
        # Page 1 supplies both the total count and the first batch of
        # products, so it is requested only once.
        first_page = self.fetch_page(page=1)
        total_count = self._extract_total(first_page)
        if total_count == 0:
            return []

        total_pages = self.calculate_total_pages(total_count, max_pages)
        log.info(f"Kategori {self.category_id}: {total_count} ürün, {total_pages} sayfa çekilecek")

        all_products = []
        for page in range(1, total_pages + 1):
            data = first_page if page == 1 else self.fetch_page(page)
            if not data or not data.get('isSuccess'):
                log.warning(f"Sayfa {page} atlandı")
            else:
                # 'products' may be missing or null
                all_products.extend(data.get('products') or [])
            # Rate limiting: wait even after a failed page so that errors
            # do not turn into back-to-back requests.
            if page < total_pages:
                time.sleep(delay)
        return all_products

    def save_to_json(self, products: List[Dict[str, Any]], filename: str) -> bool:
        """
        Save the products to a JSON file.

        Args:
            products: List of products.
            filename: Target file path.

        Returns:
            True on success, False if anything went wrong (the error is logged).
        """
        try:
            # Create the directory if needed. A bare file name has an empty
            # dirname, and os.makedirs("") raises, so skip it in that case.
            directory = os.path.dirname(filename)
            if directory:
                os.makedirs(directory, exist_ok=True)
            output = {
                "scraped_at": datetime.now().isoformat(),
                "category_id": self.category_id,
                "total_products": len(products),
                "products": products
            }
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            return True
        except Exception as e:
            # Deliberate best-effort boundary: callers only look at the bool.
            log.error(f"Dosya kaydetme hatası: {e}")
            return False

    def get_category_info(self) -> Optional[Dict[str, Any]]:
        """Return the 'categoryInfo' object of page 1, or None when the request fails."""
        data = self.fetch_page(page=1)
        if not data or not data.get('isSuccess'):
            return None
        return data.get('categoryInfo', {})
class TrendyolSearchScraper:
    """Fetch products through the Trendyol Search API — works for all category types (-c and -s)."""

    API_BASE_URL = "https://apigw.trendyol.com/discovery-sfint-search-service/api/search/products"

    def __init__(self, path_model: str, page_size: int = 24):
        """
        Args:
            path_model: Category path sent as the 'pathModel' query parameter.
            page_size: Products per page; values below 1 are raised to 1.
        """
        self.path_model = path_model
        # A page size below 1 would divide by zero when computing the page count.
        self.page_size = max(1, page_size)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "Accept": "application/json",
            "Referer": f"https://www.trendyol.com/{path_model}",
            "Origin": "https://www.trendyol.com"
        }
        self.cookies = {
            "storefrontId": "1",
            "language": "tr",
            "countryCode": "TR"
        }

    def fetch_page(self, page: int) -> Optional[Dict[str, Any]]:
        """Fetch a single page.

        Returns:
            The decoded JSON object, or None when the request fails, the
            body is not valid JSON, or the payload is not a JSON object.
        """
        params = {
            "pathModel": self.path_model,
            "pi": page,
            "ps": self.page_size,
            "channelId": 1,
            "storefrontId": 1,
            "culture": "tr-TR"
        }
        try:
            response = requests.get(
                self.API_BASE_URL,
                params=params,
                headers=self.headers,
                cookies=self.cookies,
                timeout=15
            )
            response.raise_for_status()
            payload = response.json()
            if not isinstance(payload, dict):
                # Callers use .get() on the result, so anything else is unusable.
                raise ValueError(f"unexpected JSON payload type: {type(payload).__name__}")
            return payload
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures, which are not a
            # RequestException subclass on older versions of requests.
            log.warning(f"Search API sayfa {page} error ({self.path_model}): {e}")
            return None

    @staticmethod
    def _coerce_total(value: Any) -> int:
        """Convert a total-count field to a non-negative int; 0 when it cannot be parsed."""
        try:
            return max(int(value), 0)
        except (TypeError, ValueError):
            return 0

    def fetch_all_products(self, delay: float = 1.0, max_pages: int = 10) -> List[Dict[str, Any]]:
        """Fetch and normalize all products (max_pages=10 x page_size=24 = 240 products).

        Args:
            delay: Seconds to wait between page requests.
            max_pages: Maximum number of pages to request.

        Returns:
            Normalized product dicts; empty when the first page fails or
            carries neither a usable total nor any products.
        """
        first = self.fetch_page(1)
        if not first:
            return []

        # The API has returned the count under different keys and as either
        # int or str. Take the first field that parses to a positive number,
        # so a value like "0" does not hide a usable later field.
        total = 0
        for key in ("total", "totalCount", "roughTotal"):
            total = self._coerce_total(first.get(key))
            if total > 0:
                break

        # Copy so the response dict is not mutated; 'products' may be null.
        raw_products = list(first.get("products") or [])
        if total == 0 and not raw_products:
            return []
        # Even with a zero total, keep the products that did arrive (one page)
        if total == 0:
            total = len(raw_products)

        total_pages = min(math.ceil(total / self.page_size), max_pages)
        log.info(f"Search API {self.path_model}: {total} ürün, {total_pages} sayfa çekilecek")
        for page in range(2, total_pages + 1):
            data = self.fetch_page(page)
            if data and data.get("products"):
                raw_products.extend(data["products"])
            if page < total_pages:
                time.sleep(delay)
        return [_normalize_search_product(p) for p in raw_products]
def _normalize_search_product(raw: dict) -> dict:
"""Search API ürün formatını mevcut sisteme uyumlu hale getir"""
brand = raw.get("brand", {})
if isinstance(brand, str):
brand = {"name": brand}
price = raw.get("price", {})
if isinstance(price, (int, float)):
price = {"sellingPrice": price, "originalPrice": price}
elif isinstance(price, dict) and "sellingPrice" not in price:
# Search API returns current/discountedPrice/originalPrice — map to sellingPrice
price["sellingPrice"] = price.get("discountedPrice") or price.get("current") or price.get("originalPrice") or price.get("old") or 0
rating = raw.get("ratingScore", {})
if rating is None:
rating = {}
return {
"id": raw.get("id") or raw.get("contentId"),
"name": raw.get("name", ""),
"brand": brand,
"price": price,
"ratingScore": rating,
"url": raw.get("url", ""),
"imageUrl": raw.get("image", raw.get("imageUrl", "")),
"merchantListings": raw.get("merchantListings", []),
"winnerVariant": raw.get("winnerVariant", {}),
"socialProofs": raw.get("socialProofs", []),
"categoryId": raw.get("categoryId"),
"categoryName": raw.get("categoryName"),
}
def scrape_category(category_id: int, category_name: str, output_dir: str = "../categories") -> Dict[str, Any]:
    """
    Scrape a single category and store its products as a JSON file.

    Args:
        category_id: Trendyol category ID.
        category_name: Category name (used in the output file name).
        output_dir: Directory the JSON file is written to.

    Returns:
        A result dict with the keys category_id, category_name, success,
        total_products, file_path and error. Exceptions are not propagated;
        their message is stored under "error".
    """
    outcome: Dict[str, Any] = dict(
        category_id=category_id,
        category_name=category_name,
        success=False,
        total_products=0,
        file_path=None,
        error=None,
    )
    try:
        client = TrendyolScraper(category_id=category_id, page_size=20)
        # At most 5 pages of 20 products each = 100 products
        fetched = client.fetch_all_products(delay=1.0, max_pages=5)
        if not fetched:
            outcome["error"] = "No products found"
            return outcome

        target = f"{output_dir}/{category_name}_{category_id}.json"
        if client.save_to_json(fetched, target):
            outcome.update(
                success=True,
                total_products=len(fetched),
                file_path=target,
            )
        else:
            outcome["error"] = "Failed to save JSON"
    except Exception as exc:
        outcome["error"] = str(exc)
    return outcome
def scrape_multiple_categories(categories: List[tuple], delay: float = 2.0) -> Dict[str, Any]:
    """
    Scrape several categories one after another.

    Args:
        categories: List of (category_id, category_name) tuples.
        delay: Seconds to wait between two categories.

    Returns:
        A summary dict: timestamp, category count, success/failure counters,
        total number of products, and the per-category results under "details".
    """
    summary = {
        "scraped_at": datetime.now().isoformat(),
        "total_categories": len(categories),
        "successful": 0,
        "failed": 0,
        "total_products": 0,
        "details": []
    }
    position = 0
    for cat_id, cat_name in categories:
        position += 1
        log.info(f"[{position}/{len(categories)}] {cat_name} (ID: {cat_id})")

        outcome = scrape_category(cat_id, cat_name)
        summary["details"].append(outcome)

        if not outcome["success"]:
            summary["failed"] += 1
            log.error(f"Hata: {outcome['error']}")
        else:
            summary["successful"] += 1
            summary["total_products"] += outcome["total_products"]
            log.info(f"Başarılı: {outcome['total_products']} ürün")

        # No pause after the last category
        if position < len(categories):
            time.sleep(delay)
    return summary