Source code for peptidedigest.article_db

"""
Functions for creating and managing a SQLite database for storing articles and their model responses.
"""

import sqlite3
import os

[docs] def create_database(name): """ Create a SQLite database with the given name. Parameters ---------- name : str The name of the database to create. Returns ------- None The database is created in the current working directory. """ database = name + ".db" conn = sqlite3.connect(database) c = conn.cursor() c.execute("""CREATE TABLE IF NOT EXISTS article_info ( id INTEGER PRIMARY KEY, title TEXT, authors TEXT, journal TEXT, publisher TEXT, date DATE, url TEXT UNIQUE, doi TEXT UNIQUE, keywords TEXT, scidir_pmc TEXT, pmc_id TEXT UNIQUE )""") c.execute("""CREATE TABLE IF NOT EXISTS model_responses ( id INTEGER PRIMARY KEY, doi TEXT UNIQUE, bullet_points TEXT, summary TEXT, metadata TEXT, score REAL, score_justification TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") c.execute("""CREATE TABLE IF NOT EXISTS peptides ( id INTEGER PRIMARY KEY, doi TEXT, peptide TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") c.execute("""CREATE TABLE IF NOT EXISTS proteins ( id INTEGER PRIMARY KEY, doi TEXT, protein TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") c.execute("""CREATE TABLE IF NOT EXISTS domains ( id INTEGER PRIMARY KEY, doi TEXT, domain TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") c.execute("""CREATE TABLE IF NOT EXISTS chemistry_topics ( id INTEGER PRIMARY KEY, doi TEXT, chemistry TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") c.execute("""CREATE TABLE IF NOT EXISTS biology_topics ( id INTEGER PRIMARY KEY, doi TEXT, biology TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") c.execute("""CREATE TABLE IF NOT EXISTS computational_methods ( id INTEGER PRIMARY KEY, doi TEXT, computational_method TEXT, FOREIGN KEY(doi) REFERENCES article_info(doi) )""") conn.commit() conn.close()
[docs] def insert_article(database, article_info, model_responses=None): """ Insert an article and its model responses into the database. Parameters ---------- database : str The name of the database to insert the article into. article_info : dict A dictionary containing the article information. model_responses : dict A dictionary containing the model responses for the article. Returns ------- None The article and model responses are inserted into the database. """ database = database + ".db" # check if the database exists if not os.path.exists(database): raise FileNotFoundError(f"Database {database} does not exist.") conn = sqlite3.connect(database) c = conn.cursor() # Insert article information c.execute( """INSERT OR IGNORE INTO article_info (title, authors, journal, publisher, date, url, doi, keywords, scidir_pmc, pmc_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", ( article_info["title"], ", ".join(article_info["authors"]), article_info["journal"], article_info["publisher"], article_info["date"], article_info["url"], article_info["doi"], ", ".join(article_info["keywords"]), article_info["scidir/pmc"], article_info["pmc_id"] ), ) # Insert model responses if model_responses is None: conn.commit() conn.close() return c.execute( """INSERT OR IGNORE INTO model_responses (doi, bullet_points, summary, metadata, score, score_justification) VALUES (?, ?, ?, ?, ?, ?)""", ( article_info["doi"], model_responses["bullet_points"], model_responses["summary"], model_responses["metadata"], model_responses["score"], model_responses["score_justification"], ), ) if model_responses["peptides"] is None: model_responses["peptides"] = [] for peptide in model_responses["peptides"]: c.execute( """INSERT OR IGNORE INTO peptides (doi, peptide) VALUES (?, ?)""", ( article_info["doi"], peptide ) ) if model_responses["proteins"] is None: model_responses["proteins"] = [] for protein in model_responses["proteins"]: c.execute( """INSERT OR IGNORE INTO proteins (doi, protein) VALUES (?, ?)""", ( article_info["doi"], protein ) ) if model_responses["domains"] is None: model_responses["domains"] = [] for domain in model_responses["domains"]: c.execute( """INSERT OR IGNORE INTO domains (doi, domain) VALUES (?, ?)""", ( article_info["doi"], domain ) ) if model_responses["chemistry"] is None: model_responses["chemistry"] = [] for topic in model_responses["chemistry"]: c.execute( """INSERT OR IGNORE INTO chemistry_topics (doi, chemistry) VALUES (?, ?)""", ( article_info["doi"], topic ) ) if model_responses["biology"] is None: model_responses["biology"] = [] for topic in model_responses["biology"]: c.execute( """INSERT OR IGNORE INTO biology_topics (doi, biology) VALUES (?, ?)""", ( article_info["doi"], topic ) ) if model_responses["computational_methods"] is None: model_responses["computational_methods"] = [] for method in model_responses["computational_methods"]: c.execute( """INSERT OR IGNORE INTO computational_methods (doi, computational_method) VALUES (?, ?)""", ( article_info["doi"], method ) ) conn.commit() conn.close()
[docs] def get_article(database, doi= None, pmc_id=None): """ Get the article information and model responses for a given DOI. Parameters ---------- database : str The name of the database to retrieve the article from. doi : str The DOI of the article to retrieve. pmc_id : str The PMC ID of the article to retrieve. Returns ------- dict A dictionary containing the article information and model responses. """ if doi is None and pmc_id is None: raise ValueError("Either DOI or PMC ID must be provided.") if doi is not None and pmc_id is not None: raise ValueError("Only one of DOI or PMC ID should be provided.") if doi is not None: if not check_article_exists(database, doi, "doi"): raise ValueError(f"Article with DOI {doi} does not exist in the database.") if pmc_id is not None: if not check_article_exists(database, pmc_id, "pmc_id"): raise ValueError(f"Article with PMC ID {pmc_id} does not exist in the database.") database = database + ".db" conn = sqlite3.connect(database) c = conn.cursor() if doi is not None: c.execute( """SELECT * FROM article_info WHERE doi = ?""", (doi,), ) article_info = c.fetchone() c.execute( """SELECT * FROM model_responses WHERE doi = ?""", (doi,), ) model_responses = c.fetchone() c.execute( """SELECT peptide FROM peptides WHERE doi = ?""", (doi,), ) peptides = c.fetchall() c.execute( """SELECT protein FROM proteins WHERE doi = ?""", (doi,), ) proteins = c.fetchall() c.execute( """SELECT domain FROM domains WHERE doi = ?""", (doi,), ) domains = c.fetchall() c.execute( """SELECT chemistry FROM chemistry_topics WHERE doi = ?""", (doi,), ) chemistry_topics = c.fetchall() c.execute( """SELECT biology FROM biology_topics WHERE doi = ?""", (doi,), ) biology_topics = c.fetchall() c.execute( """SELECT computational_method FROM computational_methods WHERE doi = ?""", (doi,), ) computational_methods = c.fetchall() conn.close() elif pmc_id is not None: c.execute( """SELECT * FROM article_info WHERE pmc_id = ?""", (pmc_id,), ) article_info = c.fetchone() c.execute( """SELECT * FROM model_responses WHERE doi = ?""", (pmc_id,), ) model_responses = c.fetchone() c.execute( """SELECT peptide FROM peptides WHERE doi = ?""", (pmc_id,), ) peptides = c.fetchall() c.execute( """SELECT protein FROM proteins WHERE doi = ?""", (pmc_id,), ) proteins = c.fetchall() c.execute( """SELECT domain FROM domains WHERE doi = ?""", (pmc_id,), ) domains = c.fetchall() c.execute( """SELECT chemistry FROM chemistry_topics WHERE doi = ?""", (pmc_id,), ) chemistry_topics = c.fetchall() c.execute( """SELECT biology FROM biology_topics WHERE doi = ?""", (pmc_id,), ) biology_topics = c.fetchall() c.execute( """SELECT computational_method FROM computational_methods WHERE doi = ?""", (pmc_id,), ) computational_methods = c.fetchall() conn.close() article = { "title": article_info[1], "authors": article_info[2], "journal": article_info[3], "publisher": article_info[4], "date": article_info[5], "url": article_info[6], "doi": article_info[7], "keywords": article_info[8], "scidir/pmc": article_info[9], "pmc_id": article_info[10], "bullet_points": model_responses[2], "summary": model_responses[3], "metadata": model_responses[4], "score": model_responses[5], "score_justification": model_responses[6], "peptides": [peptide[0] for peptide in peptides], "proteins": [protein[0] for protein in proteins], "domains": [domain[0] for domain in domains], "chemistry_topics": [chemistry[0] for chemistry in chemistry_topics], "biology_topics": [biology[0] for biology in biology_topics], "computational_methods": [method[0] for method in computational_methods], } return article
[docs] def get_articles(database): """ Get all articles from the database. Parameters ---------- database : str The name of the database to retrieve the articles from. Returns ------- list A list of dictionaries containing the article information and model responses. """ database = database + ".db" # check if the database exists if not os.path.exists(database): raise FileNotFoundError(f"Database {database} does not exist.") conn = sqlite3.connect(database) c = conn.cursor() c.execute( """SELECT * FROM article_info""", ) articles = c.fetchall() articles_list = [] for article in articles: c.execute( """SELECT * FROM model_responses WHERE doi = ?""", (article[7],), ) model_responses = c.fetchone() c.execute( """SELECT peptide FROM peptides WHERE doi = ?""", (article[7],), ) peptides = c.fetchall() c.execute( """SELECT protein FROM proteins WHERE doi = ?""", (article[7],), ) proteins = c.fetchall() c.execute( """SELECT domain FROM domains WHERE doi = ?""", (article[7],), ) domains = c.fetchall() c.execute( """SELECT chemistry FROM chemistry_topics WHERE doi = ?""", (article[7],), ) chemistry_topics = c.fetchall() c.execute( """SELECT biology FROM biology_topics WHERE doi = ?""", (article[7],), ) biology_topics = c.fetchall() c.execute( """SELECT computational_method FROM computational_methods WHERE doi = ?""", (article[7],), ) computational_methods = c.fetchall() article_dict = { "title": article[1], "authors": article[2], "journal": article[3], "publisher": article[4], "date": article[5], "url": article[6], "doi": article[7], "keywords": article[8], "scidir/pmc": article[9], "pmc_id": article[10], "bullet_points": model_responses[2], "summary": model_responses[3], "metadata": model_responses[4], "score": model_responses[5], "score_justification": model_responses[6], "peptides": [peptide[0] for peptide in peptides], "proteins": [protein[0] for protein in proteins], "domains": [domain[0] for domain in domains], "chemistry_topics": [chemistry[0] for chemistry in chemistry_topics], "biology_topics": [biology[0] for biology in biology_topics], "computational_methods": [method[0] for method in computational_methods], } articles_list.append(article_dict) conn.close() return articles_list
[docs] def update_article(database, doi, model_responses): """ Update the model responses for an article in the database. Parameters ---------- database : str The name of the database to update the article in. doi : str The DOI of the article to update. model_responses : dict A dictionary containing the updated model responses. Returns ------- None The model responses for the article are updated in the database. """ database = database + ".db" # check if the database exists if not os.path.exists(database): raise FileNotFoundError(f"Database {database} does not exist.") conn = sqlite3.connect(database) c = conn.cursor() c.execute( """UPDATE model_responses SET bullet_points = ?, summary = ?, metadata = ?, score = ?, score_justification = ? WHERE doi = ?""", ( model_responses["bullet_points"], model_responses["summary"], model_responses["metadata"], model_responses["score"], model_responses["score_justification"], doi, ), ) c.execute( """DELETE FROM peptides WHERE doi = ?""", (doi,), ) if model_responses["peptides"] is None: model_responses["peptides"] = [] for peptide in model_responses["peptides"]: c.execute( """INSERT OR IGNORE INTO peptides (doi, peptide) VALUES (?, ?)""", ( doi, peptide ) ) c.execute( """DELETE FROM proteins WHERE doi = ?""", (doi,), ) if model_responses["proteins"] is None: model_responses["proteins"] = [] for protein in model_responses["proteins"]: c.execute( """INSERT OR IGNORE INTO proteins (doi, protein) VALUES (?, ?)""", ( doi, protein ) ) c.execute( """DELETE FROM domains WHERE doi = ?""", (doi,), ) if model_responses["domains"] is None: model_responses["domains"] = [] for domain in model_responses["domains"]: c.execute( """INSERT OR IGNORE INTO domains (doi, domain) VALUES (?, ?)""", ( doi, domain ) ) c.execute( """DELETE FROM chemistry_topics WHERE doi = ?""", (doi,), ) if model_responses["chemistry"] is None: model_responses["chemistry"] = [] for chemistry in model_responses["chemistry"]: c.execute( """INSERT OR IGNORE INTO chemistry_topics (doi, chemistry) VALUES (?, ?)""", ( doi, chemistry ) ) c.execute( """DELETE FROM biology_topics WHERE doi = ?""", (doi,), ) if model_responses["biology"] is None: model_responses["biology"] = [] for biology in model_responses["biology"]: c.execute( """INSERT OR IGNORE INTO biology_topics (doi, biology) VALUES (?, ?)""", ( doi, biology ) ) c.execute( """DELETE FROM computational_methods WHERE doi = ?""", (doi,), ) if model_responses["computational_methods"] is None: model_responses["computational_methods"] = [] for method in model_responses["computational_methods"]: c.execute( """INSERT OR IGNORE INTO computational_methods (doi, computational_method) VALUES (?, ?)""", ( doi, method ) ) conn.commit() conn.close()
[docs] def check_article_exists(database, value, column): """ Check if an article with the given value in the specified column exists in the database. Parameters ---------- database : str The name of the database to check for the article. value : str The value to check for in the specified column. column : str The column to check for the value. Returns ------- bool True if the article exists, False otherwise. """ database = database + ".db" # check if the database exists if not os.path.exists(database): raise FileNotFoundError(f"Database {database} does not exist.") conn = sqlite3.connect(database) c = conn.cursor() c.execute( f"""SELECT * FROM article_info WHERE {column} = ?""", (value,), ) article = c.fetchone() conn.close() if article is not None: return True else: return False
[docs] def delete_article(database, doi=None, pmc_id=None): """ Delete an article and its model responses from the database. Parameters ---------- database : str The name of the database to delete the article from. doi : str The DOI of the article to delete. pmc_id : str The PMC ID of the article to delete. Returns ------- None The article and model responses are deleted from the database. """ database = database + ".db" # check if the database exists if not os.path.exists(database): raise FileNotFoundError(f"Database {database} does not exist.") conn = sqlite3.connect(database) c = conn.cursor() if doi is not None: c.execute( """DELETE FROM article_info WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM model_responses WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM peptides WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM proteins WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM domains WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM chemistry_topics WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM biology_topics WHERE doi = ?""", (doi,), ) c.execute( """DELETE FROM computational_methods WHERE doi = ?""", (doi,), ) if pmc_id is not None: c.execute( """DELETE FROM article_info WHERE pmc_id = ?""", (pmc_id,), ) c.execute( """DELETE FROM model_responses WHERE doi = ?""", (pmc_id,), ) c.execute( """DELETE FROM peptides WHERE doi = ?""", (pmc_id,), ) c.execute( """DELETE FROM proteins WHERE doi = ?""", (pmc_id,), ) c.execute( """DELETE FROM domains WHERE doi = ?""", (pmc_id,), ) c.execute( """DELETE FROM chemistry_topics WHERE doi = ?""", (pmc_id,), ) c.execute( """DELETE FROM biology_topics WHERE doi = ?""", (pmc_id,), ) c.execute( """DELETE FROM computational_methods WHERE doi = ?""", (pmc_id,), ) conn.commit() conn.close()