Source code for sciencescraper.sciencedirect.scidir_scrape

"""
Functions for retrieving the raw text of ScienceDirect articles.
"""

import requests
from .scidir_clean import clean_fulltext
from .scidir_extract import (
    get_title,
    get_authors,
    get_journal,
    get_publisher,
    get_article_type,
    get_date,
    get_url,
    get_doi,
    get_open_access,
    get_keywords,
    get_abstract,
    get_methods,
    get_results,
    get_discussion,
    get_references,
)


def get_article_info(api_key, doi=None, pii=None, url=None, chunk_size=None):
    """
    Get the metadata, main sections, and full text of a ScienceDirect article
    using the ScienceDirect API.

    Exactly one identifier is used to fetch the article XML. They are checked
    in the order ``doi``, ``pii``, ``url``; the first truthy one wins and the
    others are ignored.

    Parameters
    ----------
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    doi : str, optional
        The DOI of the article to be scraped.
    pii : str, optional
        The PII of the article to be scraped.
    url : str, optional
        The URL of the article to be scraped.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is None.

    Returns
    -------
    dict or str
        A dictionary with the keys ``title``, ``authors``, ``journal``,
        ``publisher``, ``article_type``, ``date``, ``url``, ``doi``,
        ``open_access``, ``keywords``, ``abstract``, ``methods``, ``results``,
        ``discussion``, ``references`` and ``full_text``. The ``url`` and
        ``doi`` entries are extracted from the retrieved XML, not echoed from
        the arguments.

        The string ``"Invalid input"`` is returned when no identifier is
        given, and ``"Invalid URL"`` when ``url`` is not a recognised
        ScienceDirect article URL.

    Raises
    ------
    requests.exceptions.HTTPError
        If the request to the ScienceDirect API fails.
    """
    if doi:
        xml_text = get_xml_doi(api_key, doi)
    elif pii:
        xml_text = get_xml_pii(api_key, pii)
    elif url:
        xml_text = get_xml_url(api_key, url)
        # get_xml_url signals an unrecognised URL with this sentinel string;
        # surface it instead of parsing it as if it were article XML.
        if xml_text == "Invalid URL":
            return xml_text
    else:
        return "Invalid input"

    # Extract each field from the XML and assemble the result in one step.
    return {
        "title": get_title(xml_text),
        "authors": get_authors(xml_text),
        "journal": get_journal(xml_text),
        "publisher": get_publisher(xml_text),
        "article_type": get_article_type(xml_text),
        "date": get_date(xml_text),
        "url": get_url(xml_text),
        "doi": get_doi(xml_text),
        "open_access": get_open_access(xml_text),
        "keywords": get_keywords(xml_text),
        "abstract": get_abstract(xml_text),
        "methods": get_methods(xml_text),
        "results": get_results(xml_text),
        "discussion": get_discussion(xml_text),
        "references": get_references(xml_text),
        "full_text": clean_fulltext(xml_text, chunk_size),
    }
def get_full_text(api_key, doi=None, pii=None, url=None, chunk_size=None):
    """
    Get the full text of a ScienceDirect article using the ScienceDirect API.

    Exactly one identifier is used to fetch the article XML. They are checked
    in the order ``doi``, ``pii``, ``url``; the first truthy one wins and the
    others are ignored.

    Parameters
    ----------
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    doi : str, optional
        The DOI of the article to be scraped.
    pii : str, optional
        The PII of the article to be scraped.
    url : str, optional
        The URL of the article to be scraped.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is None.

    Returns
    -------
    str
        The full text of the article, as produced by ``clean_fulltext``.
        The string ``"Invalid input"`` is returned when no identifier is
        given, and ``"Invalid URL"`` when ``url`` is not a recognised
        ScienceDirect article URL.

    Raises
    ------
    requests.exceptions.HTTPError
        If the request to the ScienceDirect API fails.
    """
    if doi:
        xml_text = get_xml_doi(api_key, doi)
    elif pii:
        xml_text = get_xml_pii(api_key, pii)
    elif url:
        xml_text = get_xml_url(api_key, url)
        # get_xml_url signals an unrecognised URL with this sentinel string;
        # surface it instead of cleaning it as if it were article XML.
        if xml_text == "Invalid URL":
            return xml_text
    else:
        return "Invalid input"

    return clean_fulltext(xml_text, chunk_size)
def get_xml_doi(api_key, doi, *, timeout=30):
    """
    Get the raw XML text from an article using the ScienceDirect API and the
    article's DOI.

    Parameters
    ----------
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    doi : str
        The DOI of the article to be scraped.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Default is 30. Pass
        ``None`` to wait indefinitely.

    Returns
    -------
    str or None
        The raw XML text of the article when the API answers with status 200.
        ``None`` if the API answers with any other status that is not an
        HTTP error.

    Raises
    ------
    requests.exceptions.HTTPError
        If the request to the ScienceDirect API fails.
    requests.exceptions.Timeout
        If the API does not respond within ``timeout``.
    """
    # Make request to ScienceDirect API
    endpoint = f"https://api.elsevier.com/content/article/doi/{doi}"
    headers = {"Accept": "text/xml", "X-ELS-APIKey": api_key}
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(endpoint, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.text

    # Raises for 4xx/5xx responses; any other non-200 status yields None.
    response.raise_for_status()
    return None
def get_xml_pii(api_key, pii, *, timeout=30):
    """
    Get the raw XML text from an article using the ScienceDirect API and the
    article's PII.

    Parameters
    ----------
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    pii : str
        The PII of the article to be scraped.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Default is 30. Pass
        ``None`` to wait indefinitely.

    Returns
    -------
    str or None
        The raw XML text of the article when the API answers with status 200.
        ``None`` if the API answers with any other status that is not an
        HTTP error.

    Raises
    ------
    requests.exceptions.HTTPError
        If the request to the ScienceDirect API fails.
    requests.exceptions.Timeout
        If the API does not respond within ``timeout``.
    """
    # Make request to ScienceDirect API
    endpoint = f"https://api.elsevier.com/content/article/pii/{pii}"
    headers = {"Accept": "text/xml", "X-ELS-APIKey": api_key}
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(endpoint, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.text

    # Raises for 4xx/5xx responses; any other non-200 status yields None.
    response.raise_for_status()
    return None
def get_xml_url(api_key, url):
    """
    Get the raw XML text from an article using the ScienceDirect API and the
    article's URL.

    The PII or DOI is taken from the part of the URL that follows
    ``sciencedirect.com/science/article/pii/`` or
    ``sciencedirect.com/science/article/doi``. Any query string, fragment,
    and leading or trailing slashes are removed before the identifier is
    handed to ``get_xml_pii`` or ``get_xml_doi``.

    Parameters
    ----------
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    url : str
        The URL of the article to be scraped.

    Returns
    -------
    str
        The raw XML text of the article, or the string ``"Invalid URL"`` if
        the URL is not a recognised ScienceDirect article URL or carries no
        identifier.

    Raises
    ------
    requests.exceptions.HTTPError
        If the request to the ScienceDirect API fails.
    """
    pii_marker = "sciencedirect.com/science/article/pii/"
    # No trailing slash here, so the remainder starts with "/" and must be
    # stripped below; otherwise the API URL would contain "doi//<doi>".
    doi_marker = "sciencedirect.com/science/article/doi"

    if pii_marker in url:
        marker = pii_marker
    elif doi_marker in url:
        marker = doi_marker
    else:
        return "Invalid URL"

    # Keep only the identifier: drop "?query", "#fragment" and stray slashes.
    remainder = url.partition(marker)[2]
    identifier = remainder.split("?", 1)[0].split("#", 1)[0].strip("/")
    if not identifier:
        return "Invalid URL"

    if marker == pii_marker:
        return get_xml_pii(api_key, identifier)
    return get_xml_doi(api_key, identifier)