Coverage for src\scrape\base_scraper.py: 100%

49 statements  

coverage.py v7.10.1, created at 2025-10-19 21:13 +0800

from bs4 import BeautifulSoup
from fastapi import Request
from requests import Session
from requests.exceptions import RequestException
from src.utility.endpoints import Endpoint
from src.utility.lib import CustomException, Logger
from src.utility.utils import EndpointType, Utils, ViewType
from typing import Dict, Type, TypeVar
from urllib.parse import urlencode

T = TypeVar("T", bound="BaseScraper")


class BaseScraper:
    def __init__(self, soup: BeautifulSoup, params: Dict, view: ViewType) -> None:
        self.soup = soup
        self.params = params
        self.view = view

    @classmethod
    def scrape(cls: Type[T], endpoint: Endpoint, req: Request) -> T | None:
        endpoint = endpoint.value

        if endpoint.type == EndpointType.QUERY:
            params = req.query_params
            url = Utils.create_filmarks_link(endpoint.path + "?" + urlencode(params))

        elif endpoint.type == EndpointType.PATH:
            params = req.path_params
            url = Utils.create_filmarks_link(endpoint.path.format(**params))

        elif endpoint.type == EndpointType.COMBINED:
            params = {**req.query_params, **req.path_params}
            url = Utils.create_filmarks_link(endpoint.path.format(**req.path_params) + "?" + urlencode(req.query_params))

        else:
            raise ValueError(f"Unexpected EndpointType: {endpoint.type}")  # pragma: no cover

        try:
            with Session() as session:
                resp = session.get(url=url, headers=Utils.FILMARKS_REQUEST_HEADERS)
                soup = BeautifulSoup(resp.text, "lxml")

            cls._raise_if_page_service_unavailable(soup)
            cls._raise_if_page_not_found(soup)

            return cls(soup, params, endpoint.view)

        except RequestException as e:
            Logger.err(f"Request to Filmarks failed: '{e}'")
            raise CustomException.service_unavailable()

    @staticmethod
    def _raise_if_page_service_unavailable(soup: BeautifulSoup) -> None:
        status = soup.select_one("p.main__text")

        if status and status.text.strip().startswith("一時的にアクセスできない状態です。"):
            Logger.err("Filmarks is temporarily unavailable")
            raise CustomException.service_unavailable()

    @staticmethod
    def _raise_if_page_not_found(soup: BeautifulSoup) -> None:
        status = soup.select_one("p.main__status-ja")

        if status and status.text.strip() == "お探しのページは見つかりません。":
            Logger.err("Invalid Filmarks page requested")
            raise CustomException.not_found()