# Source code for sciencescraper.pmc.pmc_scrape
"""
Functions for retrieving the raw text of PubMed Central articles.
"""
import requests
from bs4 import BeautifulSoup
from .pmc_extract import (
get_title,
get_authors,
get_journal,
get_publisher,
get_article_type,
get_doi,
get_pmc_id,
get_date,
get_url,
get_keywords,
get_abstract,
get_intro,
get_methods,
get_discussion,
)
from .pmc_clean import clean_full_text
[docs]
def fetch_pmc_article(pmc_id, timeout=30):
    """
    Fetches an article from PMC given a PMC ID

    The article is requested from the NCBI E-utilities ``efetch`` endpoint
    with ``db=pmc`` and ``retmode=xml``, and the response body is parsed
    with BeautifulSoup's ``"xml"`` parser.

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article
    timeout : float or tuple, optional
        Number of seconds to wait for the server, passed straight through to
        ``requests.get``. Default is 30.

    Returns
    -------
    soup : BeautifulSoup or None
        The article as a BeautifulSoup object, or None if the request could
        not be completed or the server answered with a non-200 status code
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pmc", "id": pmc_id, "retmode": "xml"}

    try:
        # requests never times out on its own; without an explicit timeout a
        # stalled connection would block the caller indefinitely.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as err:
        # Report transport failures (DNS, connection, timeout, ...) the same
        # way as a bad status code: print a message and return None.
        print(f"Failed to fetch article with PMC ID {pmc_id}: {err}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch article with PMC ID {pmc_id}")
        return None

    return BeautifulSoup(response.text, "xml")
[docs]
def parse_pmc_article(pmc_article, chunk_size):
    """
    Builds a dictionary of metadata and text sections from a PMC article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object. If None, nothing is parsed.
    chunk_size : int
        The chunk size forwarded to ``clean_full_text``

    Returns
    -------
    article : dict
        One entry per extracted field plus ``"full_text"``, or None when
        ``pmc_article`` is None
    """
    if pmc_article is None:
        return None

    # (output key, extractor) pairs; the order fixes both the order in which
    # the extractors run and the key order of the resulting dictionary.
    field_extractors = (
        ("title", get_title),
        ("authors", get_authors),
        ("journal", get_journal),
        ("publisher", get_publisher),
        ("article_type", get_article_type),
        ("doi", get_doi),
        ("pmc_id", get_pmc_id),
        ("date", get_date),
        ("url", get_url),
        ("keywords", get_keywords),
        ("abstract", get_abstract),
        ("introduction", get_intro),
        ("methods", get_methods),
        ("discussion", get_discussion),
    )
    parsed = {key: extract(pmc_article) for key, extract in field_extractors}

    # The full text is the only field that depends on chunk_size.
    parsed["full_text"] = clean_full_text(pmc_article, chunk_size)
    return parsed
[docs]
def get_article_info(pmc_id, chunk_size=None):
    """
    Downloads a PMC article and returns its parsed contents

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article
    chunk_size : int, optional
        The chunk size forwarded to ``parse_pmc_article``. Default is None.

    Returns
    -------
    article : dict
        The result of ``parse_pmc_article``, or None when the article could
        not be fetched
    """
    soup = fetch_pmc_article(pmc_id)
    # A failed fetch is reported as None; pass that straight back to the caller.
    return None if soup is None else parse_pmc_article(soup, chunk_size)
[docs]
def get_full_text(pmc_id, chunk_size=None):
    """
    Downloads a PMC article and returns its cleaned full text

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article
    chunk_size : int, optional
        The chunk size forwarded to ``clean_full_text``. Default is None.

    Returns
    -------
    full_text : str
        The value produced by ``clean_full_text`` for the article, or None
        when the article could not be fetched
    """
    soup = fetch_pmc_article(pmc_id)
    if soup is not None:
        return clean_full_text(soup, chunk_size)
    # Fetch failed: there is nothing to clean.
    return None