# Source code for geotribu_scraper.spiders.tutos_crawler

#! python3  # noqa: E265

# #############################################################################
# ########## Libraries #############
# ##################################

# Standard library
import logging

# 3rd party library
from scrapy import Spider
from scrapy.http.response import Response
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings

# project
from geotribu_scraper.items import ArticleItem


# #############################################################################
# ########## Classes ###############
# ##################################
class TutorielsSpider(Spider):
    """Specific spider for tutoriels.

    Starts from the ``node/19/`` listing page located under the project's
    ``DEFAULT_URL_BASE`` setting, follows the first link found in every
    ``div.views-row`` entry and converts each target page into an
    :class:`ArticleItem`.
    """

    settings = get_project_settings()
    name = "geotribu_tutoriels"
    start_urls = [settings.get("DEFAULT_URL_BASE") + "node/19/"]

    def parse(self, response: Response):
        """Parse URLs.

        Yields one follow-up request per listing row that contains a link; the
        followed pages are handled by :meth:`parse_article`.

        :param Response response: HTTP response returned by URL requested
        """
        tutos = response.css("div.views-row")
        logging.info(
            "La page {} contient {} tutoriels".format(response.url, len(tutos))
        )

        for tuto in tutos:
            # first link of the row; rows without any link are skipped
            tuto_rel_url = tuto.css("a::attr(href)").get()
            if tuto_rel_url is not None:
                yield response.follow(tuto_rel_url, callback=self.parse_article)

    def parse_article(self, response: Response):
        """Specific parsing logic for Geotribu tutoriels.

        Yields a single :class:`ArticleItem` filled with the title, kind, URL,
        Drupal node id (when available), publication date parts, tags,
        introduction, body, image URLs and author of the page. Pages without
        any ``<article>`` element are logged and skipped.

        :param Response response: HTTP response returned by URL requested
        """
        # .get() instead of getall()[0]: a page without <title> must not crash
        # the callback just to build a log message
        logging.info(
            "Start parsing ARTICLE: {}".format(response.css("title::text").get())
        )

        articles = response.css("article")
        if not articles:
            logging.warning(
                "No <article> element found in {}. Skipping.".format(response.url)
            )
            return
        art = articles[0]

        item = ArticleItem()

        # title
        art_title_section = art.css("div.title-and-meta")
        art_title = art_title_section.css("h2.node__title a::text").get()
        item["title"] = art_title

        # kind of article - until 2013, press reviews ("revue de presse") were
        # published as plain tutorials and were not as structured.
        # A missing title is treated as a regular article.
        if "revue de presse" in (art_title or "").lower():
            item["kind"] = "rdp"
        else:
            item["kind"] = "art"

        # url, stored as found in the title link
        item["url_full"] = art_title_section.css(
            "h2.node__title a::attr(href)"
        ).get()

        # shortlink - short URL containing the Drupal content node identifier.
        # The field is left unset when no identifier can be extracted.
        drupal_node = self._extract_drupal_node(response)
        if drupal_node is not None:
            item["drupal_node"] = drupal_node

        # publication date, kept as raw (day, month, year) text parts
        art_date = art.css("div.date")
        item["published_date"] = (
            art_date.css("span.day::text").get(),
            art_date.css("span.month::text").get(),
            art_date.css("span.year::text").get(),
        )

        # tags
        item["tags"] = art_title_section.css("span.taxonomy-tag a::text").getall()

        # introduction (None when the article has none)
        item["intro"] = art.css("div.field-name-field-introduction").get()
        if item["intro"] is None:
            logging.debug("Article doesn't have introduction.")

        # body: serialized HTML of every matching block
        item["body"] = art.css("div.field-name-body").getall()

        # images URLS (converted into absolute)
        item["image_urls"] = [
            response.urljoin(i) for i in art.css("img").xpath("@src").getall()
        ]

        # author
        item["author"] = self._extract_author(art, art_title_section)

        yield item

    @staticmethod
    def _extract_drupal_node(response: Response):
        """Return the Drupal node id found in the page shortlink, if any.

        :param Response response: HTTP response of the article page

        :return: node identifier, or None when the page has no usable
            ``<link rel="shortlink">`` (missing element, missing ``href``, no
            "node" in the URL or non-numeric last path segment)
        :rtype: int or None
        """
        shortlink = response.xpath('//link[@rel="shortlink"]')
        if not shortlink:
            return None

        short_url_content = shortlink.attrib.get("href")
        if not short_url_content or "node" not in short_url_content:
            return None

        try:
            return int(short_url_content.split("/")[-1])
        except ValueError:
            logging.debug(
                "Shortlink doesn't end with a node id: {}".format(short_url_content)
            )
            return None

    @staticmethod
    def _extract_author(art, art_title_section) -> dict:
        """Build the author dictionary of an article.

        Uses the "about author" block when the article has one, otherwise
        falls back on the username displayed in the title section.

        :param art: selector of the ``<article>`` element
        :param art_title_section: selector of the title and metadata block

        :return: dictionary with "thumbnail", "name" and "description" keys
        :rtype: dict
        """
        author_block = art.css("div.view.view-about-author")
        if not author_block:
            return {
                "thumbnail": "?",
                "name": art_title_section.css("span.username a::text").get(),
                "description": "",
            }

        # "?" is the placeholder used when a piece of information is missing
        return {
            "thumbnail": author_block.css("img").xpath("@src").get(default="?"),
            "name": author_block.css(
                "div.views-field.views-field-field-nom-complet"
            )
            .css("div.field-content::text")
            .get(default="?"),
            "description": author_block.css(
                "div.views-field.views-field-field-description p"
            ).getall(),
        }
# ############################################################################# # ##### Main ####################### # ################################## if __name__ == "__main__": pass