Code source de geotribu_scraper.spiders.tutos_crawler

#! python3  # noqa: E265

# #############################################################################
# ########## Libraries #############
# ##################################

# Standard library
import logging

# 3rd party library
from scrapy import Spider
from scrapy.http.response import Response
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings

# project
from geotribu_scraper.items import ArticleItem


# #############################################################################
# ########## Classes ###############
# ##################################
[docs]class TutorielsSpider(Spider):
    """Specific spider for tutoriels."""

    settings = get_project_settings()
    name = "geotribu_tutoriels"
    # allowed_domains = ["stackoverflow.com"]
    start_urls = [settings.get("DEFAULT_URL_BASE") + "node/19/"]

[docs]    def parse(self, response: Response):
        """Parse URLs.

        :param Response response: HTTP response returned by URL requested
        """
        tutos = Selector(response).css("div.views-row")
        logging.info(
            "La page {} contient {} tutoriels".format(response.url, len(tutos))
        )
        for tuto in tutos:
            # url
            tuto_rel_url = tuto.css("a::attr(href)").get()

            if tuto_rel_url is not None:
                yield response.follow(tuto_rel_url, callback=self.parse_article)

[docs]    def parse_article(self, response: Response):
        """Specific parsing logic for Geotribu tutoriels

        :param Response response: HTTP response returned by URL requested
        """
        logging.info(
            "Start parsing ARTICLE: {}".format(response.css("title::text").getall()[0])
        )
        item = ArticleItem()

        # contenu de la art
        art = response.css("article")[0]

        # titre
        art_title_section = art.css("div.title-and-meta")
        art_title = art_title_section.css("h2.node__title a::text").get()
        item["title"] = art_title

        # type d'article - jusqu'en 2013, les revues de presse étaient des tutoriels
        # comme les autres et n'étaient pas aussi structurées
        if "revue de presse" in art_title.lower():
            item["kind"] = "rdp"
        else:
            item["kind"] = "art"

        # url
        art_rel_url = art_title_section.css("h2.node__title a::attr(href)").get()
        item["url_full"] = art_rel_url

        # shortlink - lien court contenant l'identifiant du noeud de contenu Drupal
        shortlink = response.xpath('//link[@rel="shortlink"]')
        if shortlink:
            short_url_content = shortlink.attrib.get("href")
            if "node" in short_url_content:
                item["drupal_node"] = int(short_url_content.split("/")[-1])

        # date de publication
        art_date = art.css("div.date")
        art_date_day = art_date.css("span.day::text").get()
        art_date_month = art_date.css("span.month::text").get()
        art_date_year = art_date.css("span.year::text").get()
        item["published_date"] = (art_date_day, art_date_month, art_date_year)

        # tags
        item["tags"] = art_title_section.css("span.taxonomy-tag a::text").getall()

        # récupération de l'intro
        try:
            item["intro"] = art.css("div.field-name-field-introduction").getall()[0]
        except IndexError:
            logging.debug("Article doesn't have introduction.")
            item["intro"] = None

        # corps
        art_raw_body = art.css("div.field-name-body")
        art_out_body = []
        for el in art_raw_body:
            art_out_body.append(el.get())

        item["body"] = art_out_body

        # images URLS (converted into absolute)
        item["image_urls"] = [
            response.urljoin(i) for i in art.css("img").xpath("@src").getall()
        ]

        # author
        author_block = art.css("div.view.view-about-author")
        if author_block:
            # author thumbnail
            thumbnail = (
                art.css("div.view.view-about-author").css("img").xpath("@src").getall()
            )
            if thumbnail and len(thumbnail):
                thumbnail = (
                    art.css("div.view.view-about-author")
                    .css("img")
                    .xpath("@src")
                    .getall()[0]
                )
            else:
                thumbnail = "?"

            # author name
            name = (
                author_block.css("div.views-field.views-field-field-nom-complet")
                .css("div.field-content::text")
                .getall()
            )
            if name and len(name):
                author_block.css("div.views-field.views-field-field-nom-complet").css(
                    "div.field-content::text"
                ).getall()[0]
            else:
                name = "?"

            item["author"] = {
                "thumbnail": thumbnail,
                "name": name[0],
                "description": author_block.css(
                    "div.views-field.views-field-field-description p"
                ).getall(),
            }
        else:
            item["author"] = {
                "thumbnail": "?",
                "name": art_title_section.css("span.username a::text").get(),
                "description": "",
            }

        yield item


# #############################################################################
# ##### Main #######################
# ##################################
if __name__ == "__main__":
    pass