Coverage for src\scrape\search\search_scraper.py: 100%

57 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2025-10-19 21:13 +0800

1from bs4 import BeautifulSoup 

2from bs4.element import ResultSet, Tag 

3from msgspec import Struct 

4from src.scrape.base_scraper import BaseScraper 

5from src.utility.lib import Logger, MsgSpecJSONResponse 

6from src.utility.utils import OtherInfo, PersonInfo, Utils, ViewType 

7from typing import Any, Dict, List 

8 

9 

class SearchScraper(BaseScraper):
    """Helpers for extracting fields from a parsed search-results page.

    The constructor reads the ``limit``, ``page`` and ``q`` entries of
    ``params``. ``search_heading`` and ``search_results`` start empty; no
    method in this class fills them in.

    NOTE(review): ``self.soup``, ``self.params`` and ``self.view`` are used
    throughout but never assigned here -- presumably ``BaseScraper.__init__``
    stores its three arguments under those names; confirm against the base
    class.
    """

    def __init__(self, soup: BeautifulSoup, params: Dict, view: ViewType) -> None:
        """Initialise the scraper state from the request parameters.

        Args:
            soup: Parsed document, forwarded to ``BaseScraper``.
            params: Request parameters; ``limit`` (default ``10``), ``page``
                (default ``1``) and ``q`` (default ``""``) are read from it.
            view: View descriptor, forwarded to ``BaseScraper``.
        """
        super().__init__(soup, params, view)

        self.results_limit = self.params.get("limit", 10)
        self.page_number = self.params.get("page", 1)

        self.search_query = self.params.get("q", "")
        self.search_heading = ""
        self.search_results = {}

    def get_response(self) -> Dict[str, Any]:
        """Return the response payload built from the current state.

        Returns:
            A dict with the keys ``query``, ``heading``, ``results`` and
            ``scrape_date`` (the latter taken from ``Utils.get_scrape_date()``).
        """
        return {
            "query": self.search_query,
            "heading": self.search_heading,
            "results": self.search_results,
            "scrape_date": Utils.get_scrape_date(),
        }

    def get_logging(self, idx: int, text: str) -> str:
        """Format ``text`` as a log line prefixed with the search context.

        Args:
            idx: Index shown first inside the bracketed context.
            text: The message itself.

        Returns:
            The message prefixed with the view value, ``idx``, query, heading
            and page number.
        """
        return f"[{self.view.value}] [{idx} | Query: {self.search_query} | Heading: {self.search_heading} | Page: {self.page_number}] {text}"

    def _get_heading(self) -> str:
        """Return the text of the first matching page heading, or ``""``.

        The selectors are tried in order; the first one that matches wins.
        """
        selectors = ["h1.c-heading-1", "h1.c-page-title__title"]
        for sel in selectors:
            if heading := self.soup.select_one(sel):
                return heading.text

        return ""

    def _is_results_empty(self) -> Tag | None:
        """Look up the "zero results" marker element and log it if present.

        Returns:
            The ``div.p-timeline__zero`` tag when the page contains one
            (its text is logged as a warning), otherwise ``None``. Callers
            can use the return value as a truthiness check.
        """
        condition = self.soup.select_one("div.p-timeline__zero")
        if condition:
            Logger.warn(self.get_logging(idx=0, text=condition.text))

        return condition

    def _get_results_container(self) -> ResultSet[Tag]:
        """Return every ``div.js-cassette`` directly under ``div.p-contents-grid``."""
        return self.soup.select("div.p-contents-grid > div.js-cassette")

    def _get_title(self, result: Tag) -> str:
        """Return the text of the result's title element.

        Raises:
            AttributeError: If ``result`` has no
                ``h3.p-content-cassette__title`` element.
        """
        return result.select_one("h3.p-content-cassette__title").text

    def _get_rating(self, result: Tag) -> float | str:
        """Return the result's rating as a float, or ``"-"`` when unrated.

        Surrounding whitespace is stripped before the placeholder check so a
        padded ``" - "`` is recognised instead of being passed to ``float``.

        Raises:
            AttributeError: If ``result`` has no ``div.c-rating__score``
                element.
            ValueError: If the text is neither ``"-"`` nor a valid float.
        """
        rating = result.select_one("div.c-rating__score").text.strip()

        return float(rating) if rating != "-" else rating

    def _get_data_mark(self, result: Tag) -> Struct:
        """Parse the result's ``data-mark`` attribute as ``self.view.mark``.

        Raises:
            KeyError: If ``result`` has no ``data-mark`` attribute.
        """
        return MsgSpecJSONResponse.parse(content=result.attrs["data-mark"], type=self.view.mark)

    def _get_data_clip(self, result: Tag) -> Struct:
        """Parse the result's ``data-clip`` attribute as ``self.view.clip``.

        Raises:
            KeyError: If ``result`` has no ``data-clip`` attribute.
        """
        return MsgSpecJSONResponse.parse(content=result.attrs["data-clip"], type=self.view.clip)

    def _get_poster(self, result: Tag) -> str | None:
        """Return the poster image's ``src``, or ``None`` when unavailable.

        ``None`` is returned both when there is no ``div.c2-poster-m > img``
        element and when that element carries no ``src`` attribute.
        """
        poster = result.select_one("div.c2-poster-m > img")
        if poster is None:
            return None

        # .get() rather than indexing: an <img> without src means "no poster",
        # not an error.
        return poster.attrs.get("src")

    @staticmethod
    def _get_linked_names(heading: Tag | None) -> List[str] | None:
        """Return the link texts of the ``<ul>`` that follows ``heading``.

        Returns:
            The text of every ``<a>`` inside the next ``<ul>`` sibling, or
            ``None`` when ``heading`` is ``None`` or has no such sibling.
        """
        if heading is None:
            return None

        listing = heading.find_next_sibling("ul")
        if listing is None:
            return None

        return [link.text for link in listing.find_all("a")]

    def _get_other_info(self, result: Tag, field: OtherInfo) -> str | List[str] | None:
        """Return the value(s) listed under one of the result's info headings.

        Genre and distributor have dedicated heading classes; every other
        field is located by matching ``field.title`` against the heading
        string.

        Args:
            result: The result element to search in.
            field: Which piece of information to extract.

        Returns:
            For fields in ``OtherInfo.single_fields()``, the text of the
            ``<span>`` following the heading; for all other fields, the list
            of link texts in the ``<ul>`` following the heading. ``None``
            when the heading or its value element is missing.
        """
        if field == OtherInfo.GENRE:
            info_elem = result.find("h4", class_="p-content-cassette__genre-title")

        elif field == OtherInfo.DISTRIBUTOR:
            info_elem = result.find("h4", class_="p-content-cassette__distributor-title")

        else:
            info_elem = result.find("h4", class_="p-content-cassette__other-info-title", string=field.title)

        if field in OtherInfo.single_fields():
            if info_elem is None:
                return None

            # Guard the sibling lookup too: a heading without its <span> is
            # treated the same as a missing heading.
            value_elem = info_elem.find_next_sibling("span")
            return value_elem.text if value_elem is not None else None

        return self._get_linked_names(info_elem)

    def _get_person_info(self, result: Tag, field: PersonInfo) -> List[str] | None:
        """Return the names linked under one of the result's people headings.

        Args:
            result: The result element to search in.
            field: Which people list to extract; matched via ``field.title``
                against the heading string.

        Returns:
            The link texts in the ``<ul>`` following the heading, or ``None``
            when the heading or the list is missing.
        """
        info_elem = result.find("h4", class_="p-content-cassette__people-list-term", string=field.title)

        return self._get_linked_names(info_elem)