# Source code for geotribu_scraper.spiders.rdp_crawler

#! python3  # noqa: E265

# #############################################################################
# ########## Libraries #############
# ##################################

# Standard library
import logging

# 3rd party library
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings

# project
from geotribu_scraper.items import GeoRdpItem


# #############################################################################
# ########## Classes ###############
# ##################################
class GeoRDPSpider(Spider):
    """Spider dedicated to the Geotribu press reviews ("revues de presse").

    Crawling starts from the press review listing page. :meth:`parse` follows
    the link of every listed article as well as the pagination, and
    :meth:`parse_rdp` turns each followed page into a :class:`GeoRdpItem`.
    """

    settings = get_project_settings()
    name = "geotribu_rdp"
    # allowed_domains = ["stackoverflow.com"]
    start_urls = [
        settings.get("DEFAULT_URL_BASE") + "revues-de-presse",
    ]

    def parse(self, response):
        """Parse a listing page and schedule the pages it links to.

        :param response: downloaded listing page

        :return: generator of requests, one per listed article (handled by
            :meth:`parse_rdp`) plus one for the next listing page, if any
            (handled by this method)
        """
        rdps = Selector(response).css("article")
        logging.info(
            "La page %s contient %s revues de presse", response.url, len(rdps)
        )

        for rdp in rdps:
            # the link to the full content sits in the title block
            rdp_title_section = rdp.css("div.title-and-meta")
            rdp_rel_url = rdp_title_section.css("h2.node__title a::attr(href)").get()
            if rdp_rel_url is not None:
                yield response.follow(rdp_rel_url, callback=self.parse_rdp)

        # get next page from bottom pagination to iterate over pages
        next_page = response.css("li.pager-next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_rdp(self, response):
        """Extract a press review (or a regular article) from its own page.

        :param response: downloaded content page

        :return: generator yielding a single :class:`GeoRdpItem`, or nothing
            when the page holds no ``<article>`` element
        """
        # .get() returns None instead of raising when the page has no <title>
        logging.info("Start parsing RDP: %s", response.css("title::text").get())

        # content of the press review
        articles = response.css("article")
        if not articles:
            logging.warning("No <article> found in %s: page skipped", response.url)
            return
        rdp = articles[0]

        item = GeoRdpItem()

        # title
        rdp_title_section = rdp.css("div.title-and-meta")
        rdp_title = rdp_title_section.css("h2.node__title a::text").get()
        item["title"] = rdp_title

        # kind of content - until 2013, press reviews were published as regular
        # articles and were not as structured. A missing title falls back to
        # "art" instead of crashing on None.
        if rdp_title and "revue de presse" in rdp_title.lower():
            item["kind"] = "rdp"
        else:
            item["kind"] = "art"

        # url - does not necessarily contain the Drupal node identifier, e.g.
        # contents with a custom URL: /geotribu_reborn/GeoRDP/20150220
        rdp_rel_url = rdp_title_section.css("h2.node__title a::attr(href)").get()
        item["url_full"] = rdp_rel_url

        # shortlink - short URL ending with the Drupal node identifier
        shortlink = response.xpath('//link[@rel="shortlink"]')
        if shortlink:
            short_url_content = shortlink.attrib.get("href") or ""
            if "node" in short_url_content:
                try:
                    item["drupal_node"] = int(short_url_content.split("/")[-1])
                except ValueError:
                    # leave the field unset rather than abort the whole item
                    logging.warning(
                        "Unable to read the Drupal node id from shortlink: %s",
                        short_url_content,
                    )

        # publication date, kept as a raw (day, month, year) tuple of strings
        rdp_date = rdp.css("div.date")
        rdp_date_day = rdp_date.css("span.day::text").get()
        rdp_date_month = rdp_date.css("span.month::text").get()
        rdp_date_year = rdp_date.css("span.year::text").get()
        item["published_date"] = (rdp_date_day, rdp_date_month, rdp_date_year)

        # tags
        item["tags"] = rdp_title_section.css("span.taxonomy-tag a::text").getall()

        # introduction: every paragraph located before the first "directNews" one
        intro_parts = []
        for paragraph in rdp.css("p"):
            if paragraph.css("p.directNews"):
                break
            intro_parts.append(paragraph.get())
        item["intro"] = "".join(intro_parts)

        # sections
        item["news_sections"] = rdp.css("p.typeNews::text").getall()

        # images URLS (converted into absolute)
        item["image_urls"] = [
            response.urljoin(src) for src in rdp.css("img").xpath("@src").getall()
        ]

        # news, grouped by the section heading that precedes them
        dico_news_by_section = {}
        start_section = "Non classés"
        # news met before any section heading are stored under the default
        # section instead of referencing a not-yet-defined name
        active_section = start_section
        for node in rdp.css("div.news-details, p.typeNews"):
            if node.css("p.typeNews"):
                active_section = node.get()
                logging.info("Section spotted: %s", active_section)
                dico_news_by_section.setdefault(active_section, [])
            elif node.css("div.news-details"):
                dico_news_by_section.setdefault(active_section, []).append(
                    (
                        node.css("span.news-title::text").get(),
                        node.css("img").get(),
                        node.css("p, iframe, li").getall(),
                    )
                )
            else:
                # setdefault creates the default section on first use
                dico_news_by_section.setdefault(start_section, []).append(node.get())
        item["news_details"] = dico_news_by_section

        # pseudo author to fit others crawlers structure
        item["author"] = {
            "thumbnail": "?",
            "name": "Geotribu",
            "description": "",
        }

        yield item
# #############################################################################
# ##### Main #######################
# ##################################
if __name__ == "__main__":
    # nothing to run when the module is executed directly
    pass