Source code for sciencescraper.pmc.pmc_scrape

"""
Functions for retrieving the raw text of PubMed Central articles.
"""

import requests
from bs4 import BeautifulSoup

from .pmc_extract import (
    get_title,
    get_authors,
    get_journal,
    get_publisher,
    get_article_type,
    get_doi,
    get_pmc_id,
    get_date,
    get_url,
    get_keywords,
    get_abstract,
    get_intro,
    get_methods,
    get_discussion,
)

from .pmc_clean import clean_full_text


def fetch_pmc_article(pmc_id, timeout=30):
    """
    Fetches an article from PMC given a PMC ID

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article
    timeout : float or tuple, optional
        How long to wait for the server, in seconds, passed unchanged to
        ``requests.get``. Default is 30.

    Returns
    -------
    soup : BeautifulSoup or None
        The article as a BeautifulSoup object, or None when the request
        could not be completed or the response status was not 200
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pmc", "id": pmc_id, "retmode": "xml"}

    # Without a timeout requests waits indefinitely on a stalled connection.
    # Transport failures are reported the same way as a bad status code so
    # callers only ever have to check for None.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch article with PMC ID {pmc_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch article with PMC ID {pmc_id}")
        return None

    return BeautifulSoup(response.text, "xml")
def parse_pmc_article(pmc_article, chunk_size):
    """
    Builds a dictionary of article fields from a fetched PMC article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object
    chunk_size : int
        The size of the chunks to split the full text into

    Returns
    -------
    article : dict
        The parsed article, or None when ``pmc_article`` is None
    """
    if pmc_article is None:
        return None

    # Output key -> extractor, listed in the order the fields appear in the
    # resulting dictionary.
    extractors = (
        ("title", get_title),
        ("authors", get_authors),
        ("journal", get_journal),
        ("publisher", get_publisher),
        ("article_type", get_article_type),
        ("doi", get_doi),
        ("pmc_id", get_pmc_id),
        ("date", get_date),
        ("url", get_url),
        ("keywords", get_keywords),
        ("abstract", get_abstract),
        ("introduction", get_intro),
        ("methods", get_methods),
        ("discussion", get_discussion),
    )
    parsed = {field: extract(pmc_article) for field, extract in extractors}

    # The full text is the only field that also needs the chunk size.
    parsed["full_text"] = clean_full_text(pmc_article, chunk_size)
    return parsed
def get_article_info(pmc_id, chunk_size=None):
    """
    Downloads a PMC article by ID and returns its parsed fields

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is None.

    Returns
    -------
    article : dict
        The parsed article, or None when the article could not be fetched
    """
    soup = fetch_pmc_article(pmc_id)
    # A failed fetch yields None; pass that straight back to the caller.
    return None if soup is None else parse_pmc_article(soup, chunk_size)
def get_full_text(pmc_id, chunk_size=None):
    """
    Downloads a PMC article by ID and returns only its cleaned full text

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is None.

    Returns
    -------
    full_text : str
        The full text of the article as produced by ``clean_full_text``,
        or None when the article could not be fetched
    """
    soup = fetch_pmc_article(pmc_id)
    # A failed fetch yields None; there is nothing to clean in that case.
    return None if soup is None else clean_full_text(soup, chunk_size)