Source code for sciencescraper.pmc.pmc_search
"""
Functions for searching for articles on PubMed Central.
"""
import requests
from bs4 import BeautifulSoup
import time
from .pmc_scrape import get_article_info
[docs]
def search_pmc(
query,
sort="relevance",
mindate=None,
maxdate=None,
reldate=None,
retstart=0,
retmax=20,
):
"""
Searches PMC for articles given a query
Parameters
----------
query : str
The query to search for
sort : str, optional
The sorting order for the search results. Options are:
- "relevance": Sort by relevance
- "pub_date": Sort by publication date in descending order
- "JournalName": Sort by journal in ascending order
- "Author": Sort by first author in ascending order
mindate : str, optional
The minimum date for the search results. Format is "YYYY/MM/DD", "YYYY/MM", or "YYYY". Must also provide maxdate
maxdate : str, optional
The maximum date for the search results. Format is "YYYY/MM/DD", "YYYY/MM", or "YYYY". Must also provide mindate
reldate : str, optional
The number of days to search back from the current date.
retstart : int, optional
The index of the first article to return
retmax : int, optional
The maximum number of articles to return
Returns
-------
pmc_ids : list
The PMC IDs of the search results
"""
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "pmc",
"term": f"{query} AND free fulltext[filter]",
"sort": sort,
"datetype": "pdat",
"retstart": retstart,
"retmax": retmax,
}
if mindate is not None:
params["mindate"] = mindate
if maxdate is not None:
params["maxdate"] = maxdate
if reldate is not None:
params["reldate"] = reldate
response = requests.get(url, params=params)
if response.status_code != 200:
print(f"Failed to fetch PMC IDs for query {query}")
return None
soup = BeautifulSoup(response.text, "xml")
pmc_ids = [id.text for id in soup.find_all("Id")]
return pmc_ids
[docs]
def check_new_articles(query, days, chunk_size=None):
"""
Get open access articles from PubMed Central that have been published after a specified date.
Parameters
----------
query : str
The query to search for
days : int
The number of days to search back from the current date.
chunk_size : int, optional
The size of the chunks to split the full text into
Returns
-------
pmc_articles : list of dict
A list of dictionaries containing article information
"""
pmc_ids = search_pmc(query, reldate=days)
pmc_articles = []
for pmc_id in pmc_ids:
pmc_article = get_article_info(pmc_id, chunk_size)
pmc_articles.append(pmc_article)
# Wait for 1 second to avoid overloading the server
time.sleep(1)
notify_new_articles(pmc_articles)
return pmc_articles
[docs]
def notify_new_articles(articles):
"""
Notify the user of new articles.
Parameters
----------
articles : list of dict
A list of dictionaries containing the title, authors, journal, year, URL, open access status, keywords, abstract,
methods, results, discussion, and references of the new articles.
"""
if articles:
print(f"PubMed Central has {len(articles)} new articles!")
else:
print("No new articles found.")