Source code for sciencescraper.sciencedirect.scidir_extract
"""
Functions that extract information from the raw text of ScienceDirect articles.
"""
from bs4 import BeautifulSoup
import re
def get_title(xml_text):
"""
Get the title of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The title of the article.
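Examples
--------
A minimal, illustrative fragment (a real ScienceDirect response declares
more namespaces and carries many more elements than shown here):
>>> xml = (
...     '<doc xmlns:dc="http://purl.org/dc/elements/1.1/">'
...     '<dc:title>A study of something</dc:title>'
...     '</doc>'
... )
>>> get_title(xml)
'A study of something'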
"""
soup = BeautifulSoup(xml_text, "xml")
title_tag = soup.find("dc:title")
if title_tag:
title = title_tag.text.strip()
else:
title = "Not found"
return title
def get_authors(xml_text):
"""
Get the authors of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
list
The authors of the article, each formatted as "First Name Last Name".
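Examples
--------
An illustrative fragment showing how "Last, First" names are reordered
while already-formatted names pass through unchanged:
>>> xml = (
...     '<doc xmlns:dc="http://purl.org/dc/elements/1.1/">'
...     '<dc:creator>Curie, Marie</dc:creator>'
...     '<dc:creator>Pierre Curie</dc:creator>'
...     '</doc>'
... )
>>> get_authors(xml)
['Marie Curie', 'Pierre Curie']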
"""
soup = BeautifulSoup(xml_text, "xml")
author_tags = soup.find_all("dc:creator")
authors = (
[author.text.strip() for author in author_tags]
if author_tags
else ["Not found."]
)
formatted_authors = []
for author in authors:
# Check if the author name is in the expected format
if ', ' in author:
last_name, first_name = author.split(', ', 1)
formatted_authors.append(f"{first_name} {last_name}")
else:
formatted_authors.append(author)
return formatted_authors
def get_journal(xml_text):
"""
Get the journal of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The journal of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
journal_tag = soup.find("prism:publicationName")
if journal_tag:
journal = journal_tag.text.strip()
else:
journal = "Not found"
return journal
def get_publisher(xml_text):
"""
Get the publisher of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The publisher of the article.
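Examples
--------
An illustrative fragment; the boilerplate "The Author(s)." phrase is
removed from the returned string:
>>> xml = (
...     '<doc xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/">'
...     '<prism:publisher>The Author(s). Published by Elsevier B.V.</prism:publisher>'
...     '</doc>'
... )
>>> get_publisher(xml)
'Published by Elsevier B.V.'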
"""
soup = BeautifulSoup(xml_text, "xml")
publisher_tag = soup.find("prism:publisher")
if publisher_tag:
publisher = publisher_tag.text.strip()
else:
publisher = "Not found"
phrases_to_remove = ['The Authors.', 'The Author.', 'The Author(s).']
# Escape the phrases so '(', ')' and '.' are matched literally by the regex,
# then strip any leftover surrounding whitespace
pattern = '|'.join(re.escape(phrase) for phrase in phrases_to_remove)
cleaned_publisher = re.sub(pattern, '', publisher).strip()
return cleaned_publisher
def get_article_type(xml_text):
"""
Get the article type of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The article type of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
article_type_tag = soup.find("prism:aggregationType")
if article_type_tag:
article_type = article_type_tag.text.strip()
else:
article_type = "Not found"
return article_type
def get_date(xml_text):
"""
Get the date of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The date of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
date_tag = soup.find("prism:coverDate")
if date_tag:
date = date_tag.text.strip()
else:
date = "Not found"
return date
def get_url(xml_text):
"""
Get the URL of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The URL of the article.
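Examples
--------
When no ``<link rel="scidir">`` element is present, the URL is built from
the PII instead (placeholder PII shown):
>>> get_url('<doc><pii>S0000000000000000</pii></doc>')
'https://www.sciencedirect.com/science/article/pii/S0000000000000000'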
"""
soup = BeautifulSoup(xml_text, "xml")
scidir_link = soup.find("link", {"rel": "scidir"})
if scidir_link:
url = scidir_link["href"]
else:
pii = get_pii(xml_text)
url = f"https://www.sciencedirect.com/science/article/pii/{pii}"
return url
def get_doi(xml_text):
"""
Get the DOI of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The DOI of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
doi_tag = soup.find("prism:doi")
if doi_tag:
doi = doi_tag.text.strip()
else:
doi = "Not found"
return doi
def get_pii(xml_text):
"""
Get the PII of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The PII of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
pii_tag = soup.find("pii")
if pii_tag:
pii = pii_tag.text.strip()
else:
pii = "Not found"
return pii
def get_open_access(xml_text):
"""
Get the open access status of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The open access status of the article.
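Examples
--------
The ``<openaccess>`` flag is reported as "No" when it equals "0" and "Yes"
otherwise (illustrative fragments):
>>> get_open_access('<doc><openaccess>1</openaccess></doc>')
'Yes'
>>> get_open_access('<doc><openaccess>0</openaccess></doc>')
'No'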
"""
soup = BeautifulSoup(xml_text, "xml")
open_access_tag = soup.find("openaccess")
if open_access_tag:
open_access = open_access_tag.text.strip()
if open_access == "0":
open_access = "No"
else:
open_access = "Yes"
else:
open_access = "Not found"
return open_access
def get_keywords(xml_text):
"""
Get the keywords of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
list
A list of the keywords of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
keywords_tag = soup.find_all("dcterms:subject")
if keywords_tag:
keywords = [keyword.text.strip() for keyword in keywords_tag]
else:
keywords = ["Not found"]
return keywords
def get_abstract(xml_text):
"""
Get the abstract of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The abstract of the article.
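Examples
--------
Runs of whitespace inside the abstract are collapsed to single spaces
(illustrative fragment):
>>> xml = (
...     '<doc xmlns:dc="http://purl.org/dc/elements/1.1/">'
...     '<dc:description>  An   abstract   with  uneven   spacing. </dc:description>'
...     '</doc>'
... )
>>> get_abstract(xml)
'An abstract with uneven spacing.'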
"""
soup = BeautifulSoup(xml_text, "xml")
abstract_tag = soup.find("dc:description")
if abstract_tag:
abstract = abstract_tag.text.strip()
abstract = " ".join(
abstract.split()
) # Remove extra whitespace that sometimes appears in the abstract
if abstract == "":
return "Abstract not found in article."
return abstract
else:
return "Not found."
def get_methods(xml_text):
"""
Get the methods section of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The methods section of the article.
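Examples
--------
An illustrative fragment with a single labeled methods section; Elsevier's
``ce`` namespace is assumed for the section markup:
>>> xml = (
...     '<doc xmlns:ce="http://www.elsevier.com/xml/common/dtd">'
...     '<ce:section><ce:section-title>Methods</ce:section-title>'
...     '<ce:para>Samples were prepared as described.</ce:para></ce:section>'
...     '</doc>'
... )
>>> get_methods(xml)
'Methods: Samples were prepared as described.'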
"""
soup = BeautifulSoup(xml_text, "xml")
all_sections = soup.find_all("ce:section")
for section in all_sections:
section_title = section.find("ce:section-title")
if (
section_title
and "method" in section_title.text.lower()
and "star★methods" not in section_title.text.lower()
):
methods_text = "Methods: "
for para in section.find_all("ce:para"):
# Skip paragraphs that are part of a key resources table
# (a tag name cannot contain spaces, so match on the paragraph text instead)
if "key resources table" in para.get_text().lower():
continue
methods_text += para.text.strip() + " "
methods = methods_text.strip()
# Strip non-breaking spaces, narrow no-break spaces (\u202f), and newlines
clean_methods = (
methods.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
)
return clean_methods
return "Methods section not labeled in article."
def get_results(xml_text):
"""
Get the results section of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The results section of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
all_sections = soup.find_all("ce:section")
for section in all_sections:
section_title = section.find("ce:section-title")
if section_title and "results" in section_title.text.lower():
results_text = "Results: "
for para in section.find_all("ce:para"):
results_text += para.text.strip() + " "
results = results_text.strip()
# Strip non-breaking spaces, narrow no-break spaces (\u202f), and newlines
clean_results = (
results.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
)
return clean_results
return "Results section not labeled in article."
def get_discussion(xml_text):
"""
Get the discussion section of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
str
The discussion section of the article.
"""
soup = BeautifulSoup(xml_text, "xml")
all_sections = soup.find_all("ce:section")
for section in all_sections:
section_title = section.find("ce:section-title")
if section_title and "discussion" in section_title.text.lower():
# The outer check already guarantees a "discussion" title, so only the
# "results" part needs testing here
if "result" in section_title.text.lower():
return "Discussion included in results section."
discussion_text = "Discussion: "
for para in section.find_all("ce:para"):
discussion_text += para.text.strip() + " "
discussion = discussion_text.strip()
# Strip non-breaking spaces, narrow no-break spaces (\u202f), and newlines
clean_discussion = (
discussion.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
)
return clean_discussion
return "Discussion section not labeled in article."
def get_references(xml_text):
"""
Get the references section of a ScienceDirect article from the article's raw XML text.
Parameters
----------
xml_text : str
The raw XML text of an article.
Returns
-------
list
A list of the titles of the references cited in the references section of the article.
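Examples
--------
Only each reference's ``<sb:maintitle>`` is collected (illustrative
fragment using Elsevier's ``ce`` and ``sb`` namespaces):
>>> xml = (
...     '<doc xmlns:ce="http://www.elsevier.com/xml/common/dtd"'
...     ' xmlns:sb="http://www.elsevier.com/xml/common/struct-bib/dtd">'
...     '<ce:bibliography-sec><ce:bib-reference>'
...     '<sb:maintitle>A cited work</sb:maintitle>'
...     '</ce:bib-reference></ce:bibliography-sec></doc>'
... )
>>> get_references(xml)
['A cited work']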
"""
soup = BeautifulSoup(xml_text, "xml")
all_sections = soup.find_all("ce:bibliography-sec")
reference_titles = []
for section in all_sections:
for reference in section.find_all("ce:bib-reference"):
maintitle_tag = reference.find("sb:maintitle")
if maintitle_tag:
reference_titles.append(maintitle_tag.get_text().strip())
return reference_titles
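# Example usage: a minimal sketch for turning a saved ScienceDirect full-text
# XML response into a small metadata record. The file name "article.xml" is a
# placeholder, not a path used elsewhere in this package.
if __name__ == "__main__":
    with open("article.xml", encoding="utf-8") as xml_file:
        sample_xml = xml_file.read()
    record = {
        "title": get_title(sample_xml),
        "authors": get_authors(sample_xml),
        "journal": get_journal(sample_xml),
        "doi": get_doi(sample_xml),
        "open access": get_open_access(sample_xml),
        "abstract": get_abstract(sample_xml),
    }
    for field, value in record.items():
        print(f"{field}: {value}")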