Coverage for src\scrape\info\info_scraper.py: 100%

82 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2025-10-19 21:13 +0800

1from bs4 import BeautifulSoup 

2from bs4.element import Tag 

3from msgspec import Struct 

4from src.scrape.base_scraper import BaseScraper 

5from src.utility.lib import MsgSpecJSONResponse 

6from src.utility.utils import OtherInfo, PersonInfo, Utils, ViewType 

7from typing import Any, Dict, List, Tuple 

8 

9 

10class InfoScraper(BaseScraper): 

def __init__(self, soup: BeautifulSoup, params: Dict, view: ViewType) -> None:
    """Initialise the scraper and locate the page regions the getters read.

    Args:
        soup: Parsed page, forwarded to the parent initializer.
        params: Request parameters, forwarded to the parent initializer;
            the ``"page"`` entry is read here and defaults to 1.
        view: View type, forwarded to the parent initializer.

    Raises:
        ValueError: If the ``"page"`` parameter cannot be converted to int.
    """
    super().__init__(soup, params, view)
    # NOTE(review): self.soup / self.params are presumably set by the parent
    # initializer; confirm against BaseScraper.

    # Head region: detail-page container first, timeline-mark as fallback.
    head = self.soup.select_one("div.p-content-detail__head")
    if not head:
        head = self.soup.select_one("div.p-timeline-mark")
    self.detail_head = head

    # Foot region: detail-page container first, profile container as fallback.
    foot = self.soup.select_one("div.p-content-detail__foot")
    if not foot:
        foot = self.soup.select_one("div.p-profile__main")
    self.detail_foot = foot

    requested_page = self.params.get("page", 1)
    self.page_number = int(requested_page)
    self.data = {}

19 

def get_response(self) -> Dict[str, Any]:
    """Build the response payload.

    Returns:
        A dict with ``"data"`` (the current ``self.data``) and
        ``"scrape_date"`` (the value of ``Utils.get_scrape_date()``).
    """
    payload: Dict[str, Any] = {"data": self.data}
    payload["scrape_date"] = Utils.get_scrape_date()
    return payload

25 

def get_logging(self, id: List[int], text: str) -> str:
    """Format a log line tagged with the current view and the given IDs.

    Args:
        id: IDs to show, rendered comma-separated in the ``ID:`` tag.
        text: Message appended after the tags.

    Returns:
        A string of the form ``[<view value>] [ID: 1, 2] <text>``.
    """
    view_label = self.view.value
    joined_ids = ", ".join(map(str, id))
    return f"[{view_label}] [ID: {joined_ids}] {text}"

28 

29 def _get_title(self) -> str: 

30 selectors = ["h2.p-content-detail__title > span", "h2.c-content-box-s__title", "div.p-timeline-mark__title > a"] 

31 for sel in selectors: 

32 if title := self.detail_head.select_one(sel): 

33 return title.find(string=True, recursive=False).text 

34 

35 def _get_original_title(self) -> str | None: 

36 selectors = ["p.p-content-detail__original", "p.c-content-box-s__original"] 

37 for sel in selectors: 

38 if title := self.detail_head.select_one(sel): 

39 return title.text 

40 

41 return None 

42 

43 def _get_synopsis(self) -> str | None: 

44 synopsis = self.detail_head.select_one("#js-content-detail-synopsis") 

45 

46 return synopsis.select_one("content-detail-synopsis").get(":outline").strip('"') if synopsis else None 

47 

48 def _get_rating(self) -> float | str: 

49 selectors = ["div.c2-rating-l__text", "div.c2-rating-m__text", "div.c-rating__score"] 

50 for sel in selectors: 

51 if rating := self.detail_head.select_one(sel): 

52 rating = rating.text 

53 return float(rating) if rating != "-" else rating 

54 

55 def _get_data_mark(self) -> Struct: 

56 selectors = ["div.c-content__counts > div.js-btn-mark", "div.c-content__actions > div.js-btn-mark"] 

57 for sel in selectors: 

58 if data_elem := self.detail_head.select_one(sel): 

59 return MsgSpecJSONResponse.parse(content=data_elem.attrs["data-mark"], type=self.view.mark) 

60 

61 def _get_data_clip(self) -> Struct: 

62 selectors = ["div.c-content__counts > div.js-btn-clip", "div.c-content__actions > div.js-btn-clip"] 

63 for sel in selectors: 

64 if data_elem := self.detail_head.select_one(sel): 

65 return MsgSpecJSONResponse.parse(content=data_elem.attrs["data-clip"], type=self.view.clip) 

66 

67 def _get_link(self) -> str: 

68 return self.soup.select_one("link").attrs["href"] 

69 

70 def _get_official_site(self) -> str | None: 

71 link = self.detail_head.select_one("li.p-content-detail-links__item--official > a") 

72 

73 return link.attrs["href"] if link else None 

74 

75 def _get_poster(self) -> str | None: 

76 poster = self.detail_head.select_one("div.c2-poster-l > img") 

77 

78 return poster.attrs["src"] if poster else None 

79 

80 def _get_production_year(self) -> Tuple[str] | None: 

81 production_year = self.detail_head.select_one("h2.p-content-detail__title a") 

82 

83 return (Utils.create_filmarks_link(production_year.attrs["href"]), int(production_year.text.replace("年", ""))) if production_year else None 

84 

def _get_other_info(self, field: OtherInfo) -> str | List[str] | List[Dict[str, Any]] | None:
    """Return the value(s) listed under one info heading of the head region.

    The heading is the ``<h3>`` whose text starts with ``field.title``.
    Genre and distributor are looked up under the secondary-info heading
    class; every other field under the primary-info class.

    Args:
        field: The info field to read.

    Returns:
        * For fields in ``OtherInfo.single_fields()``: the heading text with
          ``field.title`` removed.
        * For country of origin on the anime view: the text of each ``<li>``
          in the ``<ul>`` following the heading.
        * Otherwise: one ``Utils.create_other_info`` entry per ``<a>`` in
          that ``<ul>``.
        * ``None`` when the heading (or, for list fields, its ``<ul>``) is
          not present.
    """
    if field in (OtherInfo.GENRE, OtherInfo.DISTRIBUTOR):
        heading_class = "p-content-detail__secondary-info-title"
    else:
        heading_class = "p-content-detail__primary-info-title"

    info_elem = self.detail_head.find(
        "h3",
        class_=heading_class,
        # A tag's .string is None unless it has exactly one text child, so
        # guard before calling .startswith on whatever bs4 passes in.
        string=lambda s: bool(s) and s.startswith(field.title),
    )
    if not info_elem:
        return None

    if field in OtherInfo.single_fields():
        # Single-value fields keep label and value in the heading itself.
        return info_elem.text.replace(field.title, "")

    value_list = info_elem.find_next_sibling("ul")
    if value_list is None:
        # Heading without a value list: report "no data" instead of crashing.
        return None

    if self.view == ViewType.ANIME and field == OtherInfo.COUNTRY_OF_ORIGIN:
        return [name.text for name in value_list.find_all("li")]

    return [
        Utils.create_other_info(name=other.text, link=other.attrs["href"])
        for other in value_list.find_all("a")
    ]

107 

def _get_person_info(self, field: PersonInfo) -> List[Dict[str, Any]] | None:
    """Return the people listed for one credit field of the head region.

    Args:
        field: The credit to read. ``PersonInfo.CAST`` is read from the cast
            list; any other field is read from the ``<ul>`` following the
            ``<h3>`` whose text equals ``field.title``.

    Returns:
        One ``Utils.create_person_info`` entry per person (cast entries also
        carry ``character``, an empty string when no subtext is present), or
        ``None`` when the cast list / heading is not found.
    """
    if field == PersonInfo.CAST:
        cast_block = self.detail_head.select_one("div.p-people-list__casts")
        if not cast_block:
            return None

        cast = []
        for person in cast_block.select("h4.p-people-list__item"):
            person_name = person.select_one("div.c2-button-tertiary-s-multi-text__text").text
            person_link = person.select_one("a").attrs["href"]
            # The character subtext is optional on a cast entry.
            role_elem = person.select_one("div.c2-button-tertiary-s-multi-text__subtext")
            role = role_elem.text if role_elem else ""
            cast.append(
                Utils.create_person_info(name=person_name, link=person_link, character=role)
            )
        return cast

    term = self.detail_head.find("h3", class_="p-content-detail__people-list-term", string=field.title)
    if not term:
        return None

    people = []
    for person in term.find_next_sibling("ul").find_all("li"):
        person_name = person.find("div").text
        person_link = person.find("a").attrs["href"]
        people.append(Utils.create_person_info(name=person_name, link=person_link))
    return people

133 

134 def _is_reviews_empty(self) -> Tag | None: 

135 condition = self.detail_foot.select_one("div.p2-empty-reviews-message__text") 

136 

137 return condition 

138 

def _get_review(self) -> Dict[str, Any]:
    """Build the review entry for a single-review page.

    User name and link are read from the foot region; date, rating and
    review body from the head region.

    Returns:
        The value of ``Utils.create_review_info`` for this review. The rating
        is a float, or the literal ``"-"`` when the page shows a dash.

    Raises:
        AttributeError: If any of the expected elements is missing.
    """
    user_name = self.detail_foot.select_one("h2.p-profile__name > a").text
    user_link = self.detail_foot.select_one("div.p-profile__content > a").attrs["href"]
    review_date = self.detail_head.select_one("time.c-media__date").text

    score_text = self.detail_head.select_one("div.c-rating__score").text
    if score_text != "-":
        review_rating = float(score_text)
    else:
        review_rating = score_text

    review_body = self.detail_head.select_one("div.p-mark-review")
    review_contents = review_body.get_text(separator=" ", strip=True)

    return Utils.create_review_info(
        user_name=user_name,
        user_link=user_link,
        review_date=review_date,
        review_rating=review_rating,
        review_contents=review_contents,
    )

147 

def _get_review_info(self) -> List[Dict[str, Any]] | None:
    """Build one review entry per ``div.p-mark`` in the foot region.

    Returns:
        A list of ``Utils.create_review_info`` values, or ``None`` when the
        foot region contains no ``div.p-mark`` elements. Each rating is a
        float, or the literal ``"-"`` when the page shows a dash; the review
        body is an empty string when the entry has no ``div.p-mark-review``.
    """
    marks = self.detail_foot.select("div.p-mark")
    if not marks:
        return None

    reviews = []
    for mark in marks:
        # The heading anchor supplies both the display name and the review link.
        heading_anchor = mark.select_one("div.c2-user-m__heading a")
        user_name = heading_anchor.text.replace("の感想・評価", "")
        user_link = mark.select_one("div.c2-user-m > a").attrs["href"]
        review_date = mark.select_one("time.c-media__date").text

        score_text = mark.select_one("div.c2-rating-s__text").text
        review_rating = score_text if score_text == "-" else float(score_text)

        body = mark.select_one("div.p-mark-review")
        review_contents = body.get_text(separator=" ", strip=True) if body else ""

        reviews.append(
            Utils.create_review_info(
                user_name=user_name,
                user_link=user_link,
                review_date=review_date,
                review_rating=review_rating,
                review_contents=review_contents,
                review_link=heading_anchor.attrs["href"],
            )
        )
    return reviews