Source code for sciencescraper.sciencedirect.scidir_extract

"""
Functions that extract information from the raw text of ScienceDirect articles.
"""

from bs4 import BeautifulSoup
import re

def get_title(xml_text):
    """
    Get the title of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The title of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    title_tag = soup.find("dc:title")
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = "Not found"
    return title

def get_authors(xml_text):
    """
    Get the authors of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    list
        The authors of the article in the format [First Name Last Name].
    """
    soup = BeautifulSoup(xml_text, "xml")
    author_tags = soup.find_all("dc:creator")
    authors = (
        [author.text.strip() for author in author_tags]
        if author_tags
        else ["Not found."]
    )
    formatted_authors = []
    for author in authors:
        # Check if the author name is in the expected format
        if ', ' in author:
            last_name, first_name = author.split(', ', 1)
            formatted_authors.append(f"{first_name} {last_name}")
        else:
            formatted_authors.append(author)
    return formatted_authors

def get_journal(xml_text):
    """
    Get the journal of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The journal of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    journal_tag = soup.find("prism:publicationName")
    if journal_tag:
        journal = journal_tag.text.strip()
    else:
        journal = "Not found"
    return journal

def get_publisher(xml_text):
    """
    Get the publisher of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The publisher of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    publisher_tag = soup.find("prism:publisher")
    if publisher_tag:
        publisher = publisher_tag.text.strip()
    else:
        publisher = "Not found"
    phrases_to_remove = ['The Authors.', 'The Author.', 'The Author(s).']
    # Escape the phrases so '(', ')' and '.' are matched literally in the regex
    pattern = '|'.join(re.escape(phrase) for phrase in phrases_to_remove)
    cleaned_publisher = re.sub(pattern, '', publisher)
    return cleaned_publisher

def get_article_type(xml_text):
    """
    Get the article type of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The article type of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    article_type_tag = soup.find("prism:aggregationType")
    if article_type_tag:
        article_type = article_type_tag.text.strip()
    else:
        article_type = "Not found"
    return article_type

def get_date(xml_text):
    """
    Get the date of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The date of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    date_tag = soup.find("prism:coverDate")
    if date_tag:
        date = date_tag.text.strip()
    else:
        date = "Not found"
    return date

def get_url(xml_text):
    """
    Get the URL of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The URL of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    scidir_link = soup.find("link", {"rel": "scidir"})
    if scidir_link:
        url = scidir_link["href"]
    else:
        pii = get_pii(xml_text)
        url = f"https://www.sciencedirect.com/science/article/pii/{pii}"
    return url

def get_doi(xml_text):
    """
    Get the DOI of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The DOI of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    doi_tag = soup.find("prism:doi")
    if doi_tag:
        doi = doi_tag.text.strip()
    else:
        doi = "Not found"
    return doi

def get_pii(xml_text):
    """
    Get the PII of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The PII of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    pii_tag = soup.find("pii")
    if pii_tag:
        pii = pii_tag.text.strip()
    else:
        pii = "Not found"
    return pii

def get_open_access(xml_text):
    """
    Get the open access status of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The open access status of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    open_access_tag = soup.find("openaccess")
    if open_access_tag:
        open_access = open_access_tag.text.strip()
        if open_access == "0":
            open_access = "No"
        else:
            open_access = "Yes"
    else:
        open_access = "Not found"
    return open_access

def get_keywords(xml_text):
    """
    Get the keywords of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    list
        A list of the keywords of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    keywords_tag = soup.find_all("dcterms:subject")
    if keywords_tag:
        keywords = [keyword.text.strip() for keyword in keywords_tag]
    else:
        keywords = ["Not found"]
    return keywords

def get_abstract(xml_text):
    """
    Get the abstract of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The abstract of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    abstract_tag = soup.find("dc:description")
    if abstract_tag:
        abstract = abstract_tag.text.strip()
        # Remove extra whitespace that sometimes appears in the abstract
        abstract = " ".join(abstract.split())
        if abstract == "":
            return "Abstract not found in article."
        return abstract
    else:
        return "Not found."

def get_methods(xml_text):
    """
    Get the methods section of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The methods section of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    all_sections = soup.find_all("ce:section")
    for section in all_sections:
        section_title = section.find("ce:section-title")
        if (
            section_title
            and "method" in section_title.text.lower()
            and "star★methods" not in section_title.text.lower()
        ):
            methods_text = "Methods: "
            for para in section.find_all("ce:para"):
                # Skip any key resources table
                if para.find("key resources table"):
                    continue
                methods_text += para.text.strip() + " "
            methods = methods_text.strip()
            # Strip non-breaking spaces, narrow no-break spaces, and newlines
            clean_methods = (
                methods.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
            )
            return clean_methods
    return "Methods section not labeled in article."

def get_results(xml_text):
    """
    Get the results section of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The results section of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    all_sections = soup.find_all("ce:section")
    for section in all_sections:
        section_title = section.find("ce:section-title")
        if section_title and "results" in section_title.text.lower():
            results_text = "Results: "
            for para in section.find_all("ce:para"):
                results_text += para.text.strip() + " "
            results = results_text.strip()
            # Strip non-breaking spaces, narrow no-break spaces, and newlines
            clean_results = (
                results.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
            )
            return clean_results
    return "Results section not labeled in article."

def get_discussion(xml_text):
    """
    Get the discussion section of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    str
        The discussion section of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    all_sections = soup.find_all("ce:section")
    for section in all_sections:
        section_title = section.find("ce:section-title")
        if section_title and "discussion" in section_title.text.lower():
            # A combined "Results and discussion" heading is reported separately
            if "result" in section_title.text.lower():
                return "Discussion included in results section."
            discussion_text = "Discussion: "
            for para in section.find_all("ce:para"):
                discussion_text += para.text.strip() + " "
            discussion = discussion_text.strip()
            # Strip non-breaking spaces, narrow no-break spaces, and newlines
            clean_discussion = (
                discussion.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
            )
            return clean_discussion
    return "Discussion section not labeled in article."

def get_references(xml_text):
    """
    Get the references section of a ScienceDirect article from the article's raw XML text.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.

    Returns
    -------
    list
        The titles of the references used in the references section of the article.
    """
    soup = BeautifulSoup(xml_text, "xml")
    all_sections = soup.find_all("ce:bibliography-sec")
    reference_titles = []
    for section in all_sections:
        for reference in section.find_all("ce:bib-reference"):
            maintitle_tag = reference.find("sb:maintitle")
            if maintitle_tag:
                reference_titles.append(maintitle_tag.get_text().strip())
    return reference_titles
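
A minimal usage sketch of these extractors, assuming the raw article XML has already been retrieved and saved locally; the file name "article.xml" and the surrounding script are illustrative and not part of this module:

# Example script (not part of scidir_extract): pull a few fields from one article.
# Assumes "article.xml" holds the raw XML text of a single ScienceDirect article.
from sciencescraper.sciencedirect.scidir_extract import (
    get_title,
    get_authors,
    get_doi,
    get_abstract,
)

with open("article.xml", encoding="utf-8") as f:
    xml_text = f.read()

print(get_title(xml_text))     # article title, or "Not found"
print(get_authors(xml_text))   # list of author names in "First Last" form
print(get_doi(xml_text))       # DOI string, or "Not found"
print(get_abstract(xml_text))  # abstract text, or a "not found" message

Every extractor takes the same raw XML string, so the fields can be collected into a dictionary or dataframe row per article as needed.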