Source code for sciencescraper.pmc.pmc_extract

"""
Functions that extract information from the raw text of PubMed Central articles.
"""

import re


def get_title(pmc_article):
    """
    Extract the article title with newlines and tabs removed.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    title : str
        The text of the first ``article-title`` element, or None when the
        article has no such element
    """
    title_tag = pmc_article.find("article-title")
    if title_tag is None:
        return None

    # Titles are often wrapped across lines in the XML; drop the line
    # breaks and tabs but leave ordinary spaces untouched.
    without_newlines = title_tag.text.replace("\n", "")
    return without_newlines.replace("\t", "")
def get_authors(pmc_article):
    """
    Collect the author names listed in the article.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    authors : list
        One "given-names surname" string per ``contrib`` element that
        contains a ``name`` element, in document order. A missing part is
        treated as an empty string.
    """

    def _stripped_text(node, tag_name):
        # Text of the first matching descendant, or "" when there is none.
        match = node.find(tag_name)
        return match.text.strip() if match else ""

    names = []
    for contrib in pmc_article.find_all("contrib"):
        # Contributors without a <name> element are skipped entirely.
        if contrib.find("name") is None:
            continue
        last = _stripped_text(contrib, "surname")
        first = _stripped_text(contrib, "given-names")
        names.append(f"{first} {last}".strip())
    return names
def get_journal(pmc_article):
    """
    Extract the name of the journal the article appeared in.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    journal : str
        The text of the first ``journal-title`` element, or None when the
        article has no such element
    """
    journal_tag = pmc_article.find("journal-title")
    return None if journal_tag is None else journal_tag.text
def get_publisher(pmc_article):
    """
    Extract the name of the article's publisher.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    publisher : str
        The text of the first ``publisher-name`` element, or None when the
        article has no such element
    """
    publisher_tag = pmc_article.find("publisher-name")
    return None if publisher_tag is None else publisher_tag.text
def get_article_type(pmc_article):
    """
    Returns the article type of the article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    article_type : str
        The value of the ``article-type`` attribute on the first ``article``
        element, or None when the element or the attribute is missing
    """
    article = pmc_article.find("article")
    if article is None:
        return None
    # Subscripting a tag raises KeyError for an absent attribute, so the
    # None-returning contract needs .get() rather than [].
    return article.get("article-type")
def get_doi(pmc_article):
    """
    Extract the article's DOI.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    doi : str
        The text of the first ``article-id`` element whose ``pub-id-type``
        attribute is "doi", or None when there is no such element
    """
    doi_tag = pmc_article.find("article-id", {"pub-id-type": "doi"})
    if doi_tag is None:
        return None
    return doi_tag.text
def get_pmc_id(pmc_article):
    """
    Extract the article's PubMed Central identifier.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    pmc_id : str
        The text of the first ``article-id`` element whose ``pub-id-type``
        attribute is "pmc", or None when there is no such element
    """
    pmc_tag = pmc_article.find("article-id", {"pub-id-type": "pmc"})
    if pmc_tag is None:
        return None
    return pmc_tag.text
def get_date(pmc_article):
    """
    Returns the date of the article

    Only the first ``pub-date`` element is consulted. The result is built
    from the most significant component down and stops at the first missing
    one, so a partially specified date yields "year" or "year-month" rather
    than a string with dangling hyphens.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    date : str
        The date as "year-month-day", "year-month" or "year", using the
        component text as it appears in the article (whitespace stripped).
        None when there is no ``pub-date`` element or it has no year.
    """
    pub_date = pmc_article.find("pub-date")
    if pub_date is None:
        return None

    parts = []
    for component in ("year", "month", "day"):
        tag = pub_date.find(component)
        value = tag.text.strip() if tag is not None else ""
        if not value:
            # A day without a month (or anything without a year) cannot be
            # expressed unambiguously, so stop at the first gap.
            break
        parts.append(value)

    return "-".join(parts) if parts else None
def get_url(pmc_article):
    """
    Build the PubMed Central URL of the article from its PMC ID.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    url : str
        The article URL under https://www.ncbi.nlm.nih.gov/pmc/articles/,
        or None when get_pmc_id finds no PMC ID
    """
    article_id = get_pmc_id(pmc_article)
    return (
        None
        if article_id is None
        else f"https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/"
    )
def get_keywords(pmc_article):
    """
    Collect the keywords attached to the article.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    keywords : list
        The text of every ``kwd`` element in document order; empty when the
        article has none
    """
    return [kwd_tag.text for kwd_tag in pmc_article.find_all("kwd")]
def get_abstract(pmc_article):
    """
    Extract the abstract as a single line of cleaned text.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    abstract : str
        The text of the first ``abstract`` element with "−" characters
        removed, whitespace runs collapsed to single spaces, and bracketed
        reference numbers stripped by clean_references. None when the
        article has no ``abstract`` element.
    """
    abstract_tag = pmc_article.find("abstract")
    if abstract_tag is None:
        return None

    without_minus = abstract_tag.text.replace("−", "")
    # split() with no arguments also trims the ends, so this both collapses
    # internal whitespace and strips the result.
    single_spaced = " ".join(without_minus.split())
    return clean_references(single_spaced)
def get_intro(pmc_article):
    """
    Returns the introduction section of the article

    A section counts as introduction when its title contains "introduction"
    (case-insensitive). A matching section nested inside another matching
    section is skipped, because the outer section's paragraphs already
    include it and it would otherwise be emitted twice.

    Note that ``fig`` and ``table-wrap`` elements are removed from the
    article's body in place, so the passed-in object is modified.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    intro : str
        The paragraph text of the matching sections with "−" characters
        removed, whitespace collapsed and bracketed reference numbers
        stripped; separate sections are joined by a single space. Empty
        string when no section matches, None when the article has no
        ``body`` element.
    """
    body = pmc_article.find("body")
    if body is None:
        return None

    # Figure captions and table contents are not running prose.
    for fig in body.find_all("fig"):
        fig.decompose()
    for table in body.find_all("table-wrap"):
        table.decompose()

    matched_sections = []
    chunks = []
    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        if title_tag is None:
            continue
        if "introduction" not in title_tag.get_text().lower():
            continue
        # find_all walks in document order, so an enclosing section is always
        # seen before its subsections. Compare by identity: tag equality is
        # content-based and could confuse look-alike sections.
        if any(
            parent is seen
            for parent in sec.find_parents("sec")
            for seen in matched_sections
        ):
            continue
        matched_sections.append(sec)

        text = " ".join(para.get_text(separator=" ") for para in sec.find_all("p"))
        text = text.replace("−", "")
        chunks.append(" ".join(text.split()))

    return clean_references(" ".join(chunks))
def get_methods(pmc_article):
    """
    Returns the methods section of the article

    A section counts as methods when its title contains "method"
    (case-insensitive). A matching section nested inside another matching
    section (for example "Statistical methods" inside "Materials and
    Methods") is skipped, because the outer section's paragraphs already
    include it and it would otherwise be emitted twice.

    Note that ``fig`` and ``table-wrap`` elements are removed from the
    article's body in place, so the passed-in object is modified.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    methods : str
        The paragraph text of each matching section with "−" characters
        removed, whitespace collapsed and bracketed reference numbers
        stripped; every section is followed by a blank line. Empty string
        when no section matches, None when the article has no ``body``
        element.
    """
    body = pmc_article.find("body")
    if body is None:
        return None

    # Figure captions and table contents are not running prose.
    for fig in body.find_all("fig"):
        fig.decompose()
    for table in body.find_all("table-wrap"):
        table.decompose()

    matched_sections = []
    chunks = []
    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        if title_tag is None:
            continue
        if "method" not in title_tag.get_text().lower():
            continue
        # find_all walks in document order, so an enclosing section is always
        # seen before its subsections. Compare by identity: tag equality is
        # content-based and could confuse look-alike sections.
        if any(
            parent is seen
            for parent in sec.find_parents("sec")
            for seen in matched_sections
        ):
            continue
        matched_sections.append(sec)

        text = " ".join(para.get_text(separator=" ") for para in sec.find_all("p"))
        text = text.replace("−", "")
        chunks.append(" ".join(text.split()))

    methods = "".join(f"{chunk}\n\n" for chunk in chunks)
    return clean_references(methods)
def get_discussion(pmc_article):
    """
    Returns the discussion section of the article

    A section counts as discussion when its title contains "discussion"
    (case-insensitive). A matching section nested inside another matching
    section is skipped, because the outer section's paragraphs already
    include it and it would otherwise be emitted twice.

    Note that ``fig`` and ``table-wrap`` elements are removed from the
    article's body in place, so the passed-in object is modified.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    discussion : str
        The paragraph text of each matching section with "−" characters
        removed, whitespace collapsed and bracketed reference numbers
        stripped; every section is followed by a blank line. Empty string
        when no section matches, None when the article has no ``body``
        element.
    """
    body = pmc_article.find("body")
    if body is None:
        return None

    # Figure captions and table contents are not running prose.
    for fig in body.find_all("fig"):
        fig.decompose()
    for table in body.find_all("table-wrap"):
        table.decompose()

    matched_sections = []
    chunks = []
    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        if title_tag is None:
            continue
        if "discussion" not in title_tag.get_text().lower():
            continue
        # find_all walks in document order, so an enclosing section is always
        # seen before its subsections. Compare by identity: tag equality is
        # content-based and could confuse look-alike sections.
        if any(
            parent is seen
            for parent in sec.find_parents("sec")
            for seen in matched_sections
        ):
            continue
        matched_sections.append(sec)

        text = " ".join(para.get_text(separator=" ") for para in sec.find_all("p"))
        text = text.replace("−", "")
        chunks.append(" ".join(text.split()))

    discussion = "".join(f"{chunk}\n\n" for chunk in chunks)
    return clean_references(discussion)
def clean_references(pmc_article):
    """
    Strip bracketed reference numbers such as "[3]" or "[1, 2]" from text.

    These markers clutter the text for readers and for downstream analysis
    by a machine learning model.

    Parameters
    ----------
    pmc_article : str
        The text to clean (despite the name, this is plain text rather than
        a parsed article)

    Returns
    -------
    cleaned_text : str
        The text with every bracket group made up solely of digits,
        whitespace and commas removed. Surrounding spaces are left as they
        were.
    """
    bracketed_numbers = re.compile(r"\[[\d\s,]+\]")
    return bracketed_numbers.sub("", pmc_article)