Coverage for src\scrape\info\info_scraper.py: 100%
82 statements
« prev ^ index » next coverage.py v7.10.1, created at 2025-10-19 21:13 +0800
« prev ^ index » next coverage.py v7.10.1, created at 2025-10-19 21:13 +0800
1from bs4 import BeautifulSoup
2from bs4.element import Tag
3from msgspec import Struct
4from src.scrape.base_scraper import BaseScraper
5from src.utility.lib import MsgSpecJSONResponse
6from src.utility.utils import OtherInfo, PersonInfo, Utils, ViewType
7from typing import Any, Dict, List, Tuple
class InfoScraper(BaseScraper):
    """Extract detail-page fields from an already-parsed HTML document.

    The scraper works on two container elements located once in
    ``__init__``: ``detail_head`` (the main detail block) and
    ``detail_foot`` (the review/profile block). Every ``_get_*`` helper
    reads from one of those containers and returns ``None`` when the piece
    of markup it looks for is absent.
    """

    def __init__(self, soup: BeautifulSoup, params: Dict, view: ViewType) -> None:
        """Locate the head/foot containers and read the requested page number.

        Args:
            soup: Parsed page to scrape.
            params: Request parameters; ``"page"`` is read (default ``1``).
            view: View type, used for log prefixes, for choosing the
                mark/clip payload types and for one ANIME-specific branch.
        """
        # NOTE(review): self.soup / self.params / self.view are presumably
        # assigned by BaseScraper.__init__ - confirm against the base class.
        super().__init__(soup, params, view)

        # Each container has two known layouts; the first selector that
        # matches wins. Either attribute is None if neither layout is present.
        self.detail_head = (
            self.soup.select_one("div.p-content-detail__head")
            or self.soup.select_one("div.p-timeline-mark")
        )
        self.detail_foot = (
            self.soup.select_one("div.p-content-detail__foot")
            or self.soup.select_one("div.p-profile__main")
        )

        self.page_number = int(self.params.get("page", 1))
        self.data = {}

    def get_response(self) -> Dict[str, Any]:
        """Return the collected ``data`` together with the scrape date."""
        return {
            "data": self.data,
            "scrape_date": Utils.get_scrape_date(),
        }

    def get_logging(self, id: List[int], text: str) -> str:
        """Format a log line as ``[<view>] [ID: <ids>] <text>``.

        Args:
            id: Identifiers to show, joined with ``", "``. The name shadows
                the builtin but is kept because callers may pass it by keyword.
            text: Message appended after the prefix.
        """
        joined_ids = ", ".join(str(i) for i in id)
        return f"[{self.view.value}] [ID: {joined_ids}] {text}"

    @staticmethod
    def _parse_rating(raw: str) -> float | str:
        """Convert rating text to ``float``; the ``"-"`` placeholder is returned unchanged."""
        return raw if raw == "-" else float(raw)

    def _get_title(self) -> str | None:
        """Return the title's own text (excluding nested elements), or ``None``.

        The known title selectors are tried in order. Only the first text
        node that is a direct child of the matched element is used.
        """
        selectors = [
            "h2.p-content-detail__title > span",
            "h2.c-content-box-s__title",
            "div.p-timeline-mark__title > a",
        ]
        for selector in selectors:
            title = self.detail_head.select_one(selector)
            if title is None:
                continue
            direct_text = title.find(string=True, recursive=False)
            # A matched element without a direct text child used to raise
            # AttributeError here; fall through to the next selector instead.
            if direct_text is not None:
                return direct_text.text

        return None

    def _get_original_title(self) -> str | None:
        """Return the original-title text, or ``None`` if no selector matches."""
        selectors = ["p.p-content-detail__original", "p.c-content-box-s__original"]
        for selector in selectors:
            title = self.detail_head.select_one(selector)
            if title is not None:
                return title.text

        return None

    def _get_synopsis(self) -> str | None:
        """Return the synopsis read from the ``:outline`` attribute, or ``None``.

        The value is taken from the ``content-detail-synopsis`` element
        inside ``#js-content-detail-synopsis`` and surrounding double quotes
        are stripped. ``None`` is returned when the container, the inner
        element or the attribute is missing.
        """
        container = self.detail_head.select_one("#js-content-detail-synopsis")
        if container is None:
            return None

        component = container.select_one("content-detail-synopsis")
        outline = component.get(":outline") if component is not None else None
        return outline.strip('"') if outline is not None else None

    def _get_rating(self) -> float | str | None:
        """Return the rating as ``float``, ``"-"`` when unrated, or ``None`` if absent."""
        selectors = ["div.c2-rating-l__text", "div.c2-rating-m__text", "div.c-rating__score"]
        for selector in selectors:
            rating = self.detail_head.select_one(selector)
            if rating is not None:
                return self._parse_rating(rating.text)

        return None

    def _get_data_mark(self) -> Struct | None:
        """Parse the ``data-mark`` attribute of the mark button into ``self.view.mark``.

        Returns ``None`` when neither known button location is present.
        """
        selectors = [
            "div.c-content__counts > div.js-btn-mark",
            "div.c-content__actions > div.js-btn-mark",
        ]
        for selector in selectors:
            data_elem = self.detail_head.select_one(selector)
            if data_elem is not None:
                return MsgSpecJSONResponse.parse(content=data_elem.attrs["data-mark"], type=self.view.mark)

        return None

    def _get_data_clip(self) -> Struct | None:
        """Parse the ``data-clip`` attribute of the clip button into ``self.view.clip``.

        Returns ``None`` when neither known button location is present.
        """
        selectors = [
            "div.c-content__counts > div.js-btn-clip",
            "div.c-content__actions > div.js-btn-clip",
        ]
        for selector in selectors:
            data_elem = self.detail_head.select_one(selector)
            if data_elem is not None:
                return MsgSpecJSONResponse.parse(content=data_elem.attrs["data-clip"], type=self.view.clip)

        return None

    def _get_link(self) -> str:
        """Return the ``href`` of the first ``<link>`` element in the document."""
        # NOTE(review): presumably the canonical link - this relies on it
        # being the first <link> in the page; confirm against real markup.
        return self.soup.select_one("link").attrs["href"]

    def _get_official_site(self) -> str | None:
        """Return the official-site URL, or ``None`` when the link is absent."""
        link = self.detail_head.select_one("li.p-content-detail-links__item--official > a")
        if link is None:
            return None
        return link.attrs["href"]

    def _get_poster(self) -> str | None:
        """Return the poster image ``src``, or ``None`` when there is no poster."""
        poster = self.detail_head.select_one("div.c2-poster-l > img")
        if poster is None:
            return None
        return poster.attrs["src"]

    def _get_production_year(self) -> Tuple[str, int] | None:
        """Return ``(link, year)`` for the production year, or ``None``.

        The link is built by ``Utils.create_filmarks_link`` from the anchor's
        ``href``; the year is the anchor text with ``年`` removed, as ``int``.
        """
        year_anchor = self.detail_head.select_one("h2.p-content-detail__title a")
        if year_anchor is None:
            return None

        link = Utils.create_filmarks_link(year_anchor.attrs["href"])
        year = int(year_anchor.text.replace("年", ""))
        return (link, year)

    def _get_other_info(self, field: OtherInfo) -> str | List[str] | List[Dict[str, Any]] | None:
        """Return the value(s) listed under the heading for ``field``.

        GENRE and DISTRIBUTOR live under a "secondary" heading class, every
        other field under the "primary" one. The result shape depends on the
        field:

        * fields in ``OtherInfo.single_fields()``: the heading text with the
          field title removed (``str``);
        * COUNTRY_OF_ORIGIN on the ANIME view: the ``<li>`` texts of the
          following ``<ul>`` (``List[str]``);
        * anything else: one ``Utils.create_other_info`` entry per ``<a>`` in
          the following ``<ul>``.

        ``None`` is returned when the heading (or, for list fields, the
        following ``<ul>``) is not found.
        """
        if field == OtherInfo.GENRE or field == OtherInfo.DISTRIBUTOR:
            heading_class = "p-content-detail__secondary-info-title"
        else:
            heading_class = "p-content-detail__primary-info-title"

        prefix = field.title
        # Beautiful Soup calls the predicate with the tag's `.string`, which is
        # None for a heading with more than one child; guard against that so an
        # unrelated heading cannot crash the lookup.
        info_elem = self.detail_head.find(
            "h3",
            class_=heading_class,
            string=lambda s: s is not None and s.startswith(prefix),
        )
        if info_elem is None:
            return None

        if field in OtherInfo.single_fields():
            return info_elem.text.replace(field.title, "")

        value_list = info_elem.find_next_sibling("ul")
        if value_list is None:
            return None

        if self.view == ViewType.ANIME and field == OtherInfo.COUNTRY_OF_ORIGIN:
            return [name.text for name in value_list.find_all("li")]

        return [
            Utils.create_other_info(
                name=other.text,
                link=other.attrs["href"],
            )
            for other in value_list.find_all("a")
        ]

    def _get_person_info(self, field: PersonInfo) -> List[Dict[str, Any]] | None:
        """Return the people credited for ``field``, or ``None`` if not listed.

        CAST entries come from ``div.p-people-list__casts`` and carry a
        ``character`` (empty string when the subtext element is missing).
        Other fields are looked up by an ``<h3>`` whose text equals
        ``field.title`` and read from the ``<ul>`` that follows it.
        """
        if field == PersonInfo.CAST:
            cast_block = self.detail_head.select_one("div.p-people-list__casts")
            if cast_block is None:
                return None

            cast = []
            for person in cast_block.select("h4.p-people-list__item"):
                character = person.select_one("div.c2-button-tertiary-s-multi-text__subtext")
                cast.append(
                    Utils.create_person_info(
                        name=person.select_one("div.c2-button-tertiary-s-multi-text__text").text,
                        link=person.select_one("a").attrs["href"],
                        character=character.text if character else "",
                    )
                )
            return cast

        heading = self.detail_head.find("h3", class_="p-content-detail__people-list-term", string=field.title)
        if heading is None:
            return None

        people_list = heading.find_next_sibling("ul")
        if people_list is None:
            return None

        return [
            Utils.create_person_info(
                name=person.find("div").text,
                link=person.find("a").attrs["href"],
            )
            for person in people_list.find_all("li")
        ]

    def _is_reviews_empty(self) -> Tag | None:
        """Return the "no reviews" message element, or ``None`` if it is absent.

        Callers use the result for its truthiness.
        """
        return self.detail_foot.select_one("div.p2-empty-reviews-message__text")

    def _get_review(self) -> Dict[str, Any]:
        """Build a single review entry from the head (review) and foot (user) blocks."""
        return Utils.create_review_info(
            user_name=self.detail_foot.select_one("h2.p-profile__name > a").text,
            user_link=self.detail_foot.select_one("div.p-profile__content > a").attrs["href"],
            review_date=self.detail_head.select_one("time.c-media__date").text,
            review_rating=self._parse_rating(self.detail_head.select_one("div.c-rating__score").text),
            review_contents=self.detail_head.select_one("div.p-mark-review").get_text(separator=" ", strip=True),
        )

    def _get_review_info(self) -> List[Dict[str, Any]] | None:
        """Return one review entry per ``div.p-mark`` in the foot block.

        Returns ``None`` when the page contains no such element. A review
        without a body yields an empty ``review_contents`` string.
        """
        review_elems = self.detail_foot.select("div.p-mark")
        if not review_elems:
            return None

        reviews = []
        for review in review_elems:
            # The heading anchor supplies both the display name and the link
            # to the individual review.
            heading_link = review.select_one("div.c2-user-m__heading a")
            body = review.select_one("div.p-mark-review")
            reviews.append(
                Utils.create_review_info(
                    user_name=heading_link.text.replace("の感想・評価", ""),
                    user_link=review.select_one("div.c2-user-m > a").attrs["href"],
                    review_date=review.select_one("time.c-media__date").text,
                    review_rating=self._parse_rating(review.select_one("div.c2-rating-s__text").text),
                    review_contents=body.get_text(separator=" ", strip=True) if body else "",
                    review_link=heading_link.attrs["href"],
                )
            )
        return reviews