Coverage for src\scrape\search\search_scraper.py: 100%
57 statements
« prev ^ index » next coverage.py v7.10.1, created at 2025-10-19 21:13 +0800
« prev ^ index » next coverage.py v7.10.1, created at 2025-10-19 21:13 +0800
1from bs4 import BeautifulSoup
2from bs4.element import ResultSet, Tag
3from msgspec import Struct
4from src.scrape.base_scraper import BaseScraper
5from src.utility.lib import Logger, MsgSpecJSONResponse
6from src.utility.utils import OtherInfo, PersonInfo, Utils, ViewType
7from typing import Any, Dict, List
class SearchScraper(BaseScraper):
    """Scraper helpers for a search-results page.

    Keeps the search options of one request and exposes small selector
    helpers that extract individual fields from a single result element.
    """

    def __init__(self, soup: BeautifulSoup, params: Dict, view: ViewType) -> None:
        """Initialise the base scraper and read the search options.

        Args:
            soup: Parsed HTML of the page to scrape.
            params: Request parameters. ``limit``, ``page`` and ``q`` are
                read, defaulting to ``10``, ``1`` and ``""`` respectively.
            view: View descriptor; its ``value``, ``mark`` and ``clip``
                attributes are used by the helpers of this class.
        """
        super().__init__(soup, params, view)

        # NOTE(review): ``self.params`` is presumably assigned by
        # ``BaseScraper.__init__`` - confirm against the base class.
        self.results_limit = self.params.get("limit", 10)
        self.page_number = self.params.get("page", 1)

        self.search_query = self.params.get("q", "")
        self.search_heading = ""
        self.search_results = {}

    def get_response(self) -> Dict[str, Any]:
        """Return the response payload for the current search state.

        Returns:
            A dict with the keys ``query``, ``heading``, ``results`` and
            ``scrape_date``.
        """
        return {
            "query": self.search_query,
            "heading": self.search_heading,
            "results": self.search_results,
            "scrape_date": Utils.get_scrape_date(),
        }

    def get_logging(self, idx: int, text: str) -> str:
        """Build a log line prefixed with the view and search context.

        Args:
            idx: Index shown first inside the context bracket.
            text: Message appended after the context bracket.

        Returns:
            The formatted log message.
        """
        context = (
            f"{idx} | Query: {self.search_query} | "
            f"Heading: {self.search_heading} | Page: {self.page_number}"
        )
        return f"[{self.view.value}] [{context}] {text}"

    def _get_heading(self) -> str:
        """Return the text of the first matching page heading, or ``""``."""
        # Two heading selectors are tried in order; the first hit wins.
        selectors = ["h1.c-heading-1", "h1.c-page-title__title"]
        for sel in selectors:
            if heading := self.soup.select_one(sel):
                return heading.text

        return ""

    def _is_results_empty(self) -> Tag | None:
        """Return the "zero results" element if the page contains one.

        When the element is present its text is logged as a warning.

        Returns:
            The ``div.p-timeline__zero`` element, or ``None`` when absent.
        """
        condition = self.soup.select_one("div.p-timeline__zero")
        if condition:
            Logger.warn(self.get_logging(idx=0, text=condition.text))

        return condition

    def _get_results_container(self) -> ResultSet[Tag]:
        """Return every result element found in the contents grid."""
        return self.soup.select("div.p-contents-grid > div.js-cassette")

    def _get_title(self, result: Tag) -> str:
        """Return the title text of ``result``.

        Raises:
            AttributeError: If ``result`` has no title element.
        """
        return result.select_one("h3.p-content-cassette__title").text

    def _get_rating(self, result: Tag) -> float | str:
        """Return the rating of ``result``.

        Returns:
            The score as ``float``, or the literal ``"-"`` when the page
            shows that placeholder instead of a number.

        Raises:
            AttributeError: If ``result`` has no rating element.
            ValueError: If the rating text is neither ``"-"`` nor numeric.
        """
        rating = result.select_one("div.c-rating__score").text
        if rating == "-":
            return rating

        return float(rating)

    def _get_data_mark(self, result: Tag) -> Struct:
        """Parse the ``data-mark`` attribute of ``result`` into ``self.view.mark``.

        Raises:
            KeyError: If ``result`` has no ``data-mark`` attribute.
        """
        return MsgSpecJSONResponse.parse(content=result.attrs["data-mark"], type=self.view.mark)

    def _get_data_clip(self, result: Tag) -> Struct:
        """Parse the ``data-clip`` attribute of ``result`` into ``self.view.clip``.

        Raises:
            KeyError: If ``result`` has no ``data-clip`` attribute.
        """
        return MsgSpecJSONResponse.parse(content=result.attrs["data-clip"], type=self.view.clip)

    def _get_poster(self, result: Tag) -> str | None:
        """Return the poster image ``src`` of ``result``, or ``None`` if absent."""
        poster = result.select_one("div.c2-poster-m > img")
        if not poster:
            return None

        return poster.attrs["src"]

    @staticmethod
    def _get_linked_names(heading: Tag | None) -> List[str] | None:
        """Return the link texts of the ``<ul>`` that follows ``heading``.

        Returns ``None`` when ``heading`` is missing or has no following
        ``<ul>`` sibling, so an incomplete result does not raise.
        """
        if not heading:
            return None

        names = heading.find_next_sibling("ul")
        if not names:
            return None

        return [link.text for link in names.find_all("a")]

    def _get_other_info(self, result: Tag, field: OtherInfo) -> str | List[str] | None:
        """Return the value listed under the ``field`` heading of ``result``.

        Args:
            result: The result element to search.
            field: Which piece of information to extract.

        Returns:
            A string for fields in ``OtherInfo.single_fields()``, a list of
            link texts for the others, or ``None`` when the heading or the
            element holding its value is missing.
        """
        # Genre and distributor have dedicated heading classes; every other
        # field shares one class and is told apart by the heading text.
        if field == OtherInfo.GENRE:
            info_elem = result.find("h4", class_="p-content-cassette__genre-title")
        elif field == OtherInfo.DISTRIBUTOR:
            info_elem = result.find("h4", class_="p-content-cassette__distributor-title")
        else:
            info_elem = result.find("h4", class_="p-content-cassette__other-info-title", string=field.title)

        if field in OtherInfo.single_fields():
            if not info_elem:
                return None

            # A heading without its value <span> yields None, not an error.
            value = info_elem.find_next_sibling("span")
            return value.text if value else None

        return self._get_linked_names(info_elem)

    def _get_person_info(self, result: Tag, field: PersonInfo) -> List[str] | None:
        """Return the names listed under the ``field`` people heading.

        Args:
            result: The result element to search.
            field: Which people list to extract; matched by heading text.

        Returns:
            The link texts of the list, or ``None`` when the heading or
            its list is missing.
        """
        info_elem = result.find("h4", class_="p-content-cassette__people-list-term", string=field.title)

        return self._get_linked_names(info_elem)