Commit b4953dc3 authored by Rafael Artinano's avatar Rafael Artinano

Refactored and documented the code

parent caafdbcc
# Build configuration for the nw_wrapper Python extension module
# (pybind11 binding of the Needleman-Wunsch aligner in nw_wrapper.cpp).
cmake_minimum_required(VERSION 3.12)
project(nw_wrapper)
# pybind11 needs at least C++11; the wrapper is written against C++14.
set(CMAKE_CXX_STANDARD 14)
find_package(pybind11 REQUIRED)
# MODULE: build a Python extension, not a regular shared library.
add_library(nw_wrapper MODULE nw_wrapper.cpp)
target_link_libraries(nw_wrapper PRIVATE pybind11::module)
# Use the platform's Python naming convention (e.g. nw_wrapper.cpython-*.so)
# so the interpreter can import the module directly.
set_target_properties(nw_wrapper PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}" SUFFIX "${PYTHON_MODULE_EXTENSION}")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import time
import numpy as np
import re
globi=0
df_b=None
def substitute_or_remove_prot_id(data, archSubs, sub_rem, archivSal=None):
    """
    Substitute or remove protein identifiers based on a substitution file.

    Parameters:
    - data: DataFrame containing protein data.
    - archSubs: path to a whitespace-separated file; the header row names the
      columns (one of them containing "Primary") and each following row maps
      a secondary protein id to its primary entry.
    - sub_rem: operation type:
        's'  -> substitute secondary ids by their primary entry;
        'p'  -> keep only the listed ids, drop duplicates and dump the
                discarded rows to a CSV;
        'c'  -> substitution + duplicate removal keyed on class columns;
        'na' -> substitution + duplicate removal keyed on the 'Entry' column;
        anything else -> substitution + duplicate removal keyed on
                disease_id / protein_id / protein_sequence.
    - archivSal: optional output Excel file name (default None).

    Returns:
    - Modified DataFrame after performing the substitution or removal.
    """
    print("inside the problem")
    # Parse the substitution file: the header row gives the column names,
    # every other row contributes one value per column.
    # BUGFIX: the original opened 'archSubss' (undefined name) -> NameError.
    with open(archSubs) as prottosubs:
        index = prottosubs.readline()
        acept = index.split()
        listtosubs = {}
        for i in range(0, len(acept)):
            listtosubs[acept[i]] = []
        while line := prottosubs.readline():
            newline = line.split()
            for i in range(0, len(newline)):
                listtosubs[list(listtosubs.keys())[i]].append(newline[i].strip())
    # resub indexes the column holding the replacement (primary) ids;
    # (resub+1)%2 indexes the column holding the ids to be replaced.
    resub = 1
    if re.search("Primary", list(listtosubs.keys())[0]):
        resub = 0
    print((resub + 1) % 2)
    global globi
    if(sub_rem == "s"):
        # BUGFIX: Series.replace returns a new Series; the original discarded
        # the result, so the substitution was silently a no-op.
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
    elif(sub_rem == "p"):
        # Split the rows into the ones listed in the primary column (kept)
        # and the rest (discarded).
        datas = data[data["protein_id"].isin(list(listtosubs.values())[resub]) == False]
        data = data[data["protein_id"].isin(list(listtosubs.values())[resub]) == True]
        data = data.drop_duplicates(keep="first", inplace=False)
        did = data.copy()
        data = data.drop_duplicates(subset=['disease_id', 'protein_sequence'], keep="first", inplace=False)
        # Rows removed by the second de-duplication join the discarded set.
        did = did[~did.isin(data).all(axis=1)]
        did = did.drop_duplicates()
        print(did)
        datas = pd.concat([datas, did], ignore_index=True)
        if(archivSal != None):
            data.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadassp_' + str(globi) + '.csv', index=False)
    elif(sub_rem == "c"):
        datas = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        # BUGFIX: assign the replace() result back (was a no-op).
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        print("tamaño original: " + str(len(data)))
        dats = data.drop_duplicates(subset=['protein_id', 'class_id'], keep='first', inplace=False)
        print("Despues de tirar duplicados en id: " + str(len(dats)))
        dats = dats.drop_duplicates(subset=['protein_sequence', 'class_id'], keep='first', inplace=False)
        print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
        if(archivSal != None):
            dats.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/clasesDescartadasc_' + str(globi) + '.csv', index=False)
        globi = globi + 1
        data = dats
    elif(sub_rem == "na"):
        datas = data[data["Entry"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        # BUGFIX: assign the replace() result back (was a no-op).
        data["Entry"] = data["Entry"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        print("tamaño original: " + str(len(data)))
        dats = data.drop_duplicates(subset=['Entry'], keep='first', inplace=False)
        print("Despues de tirar duplicados en id: " + str(len(dats)))
        print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
        if(archivSal != None):
            dats.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadasna_' + str(globi) + '.csv', index=False)
        globi = globi + 1
        data = dats
    else:
        datas = data[data["protein_id"].isin(list(listtosubs.values())[(resub + 1) % 2]) == True]
        # BUGFIX: assign the replace() result back (was a no-op).
        data["protein_id"] = data["protein_id"].replace(
            list(listtosubs.values())[(resub + 1) % 2],
            list(listtosubs.values())[resub])
        print("tamaño original: " + str(len(data)))
        dats = data.drop_duplicates(subset=['disease_id', 'protein_id'], keep='first', inplace=False)
        print("Despues de tirar duplicados en id: " + str(len(dats)))
        dats = dats.drop_duplicates(subset=['disease_id', 'protein_sequence'], keep='first', inplace=False)
        print("Despues de tirar duplicados en secuencia: " + str(len(dats)))
        if(archivSal != None):
            dats.to_excel(archivSal, index=False, columns=data.columns)
        datas.to_csv('resultados/proteinasDescartadas_' + str(globi) + '.csv', index=False)
        globi = globi + 1
        data = dats
    return data
def readData(archivoEntrada, archivoEnt2, enfermedad, archivoSal):
    """
    Read protein data from an Excel file, find every protein whose sequence
    contains one of the patterns listed in a CSV file, and save the matches
    (pattern -> list of matching protein ids) to a new Excel file.

    Parameters:
    - archivoEntrada: input Excel file path with the protein data.
    - archivoEnt2: input CSV file with a 'Patron' column of patterns to search.
    - enfermedad: optional disease id used to filter rows ('' disables it).
    - archivoSal: output Excel file path.
    """
    data = pd.read_excel(archivoEntrada)
    # BUGFIX: capture the row count BEFORE filtering; the original measured
    # it afterwards, so the "descartadas" statistics always printed 0.
    filt_data = len(data)
    if (enfermedad != ''):
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(archivoEnt2)
    alz_filt_data = len(dataB)
    print(len(data))
    print("proteinas descartadas post filtro, principal: " + str(filt_data - len(data)))
    print("proteinas descartadas post filtro, comun alz: " + str(alz_filt_data - len(dataB)))
    dataC = {}
    daa = dataB["Patron"].unique()
    for u in daa:
        # Only patterns longer than 3 characters are searched for.
        if(len(u) > 3):
            # NOTE(review): str.contains treats the pattern as a regex;
            # assumes patterns are plain amino-acid strings — confirm.
            dataC[u] = data[data.protein_sequence.str.contains(u)]["protein_id"].to_list()
    dataG = pd.DataFrame(dataC.items(), columns=["pattern", "proteins"])
    dataG.to_excel(archivoSal)
def add_name_patterns(archivoEntrada, archivNom, EqvData, OutName):
    """
    Attach protein names (read from a CSV of UniProt-style entries) to the
    proteins present in an Excel file and save the results to CSV files.

    Parameters:
    - archivoEntrada: input Excel file with a 'protein_id' column.
    - archivNom: input CSV with 'Entry', 'Entry_Name', 'Protein_names'
      and 'Length' columns.
    - EqvData: substitution file forwarded to substitute_or_remove_prot_id.
    - OutName: output file name forwarded to substitute_or_remove_prot_id.
    """
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(archivNom, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # Normalise the 'Entry' ids through the substitution table ('na' mode).
    dataB = substitute_or_remove_prot_id(dataB, EqvData, "na", OutName)
    dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])
    # Entries whose id appears in the main data: these carry the names.
    datas = dataB[dataB["Entry"].isin(data["protein_id"])]
    datas.to_csv(archivoEntrada + "_nombre.csv")
    # BUGFIX: the ids live in the 'Entry' column; the original indexed
    # 'Entry_name', a column that does not exist (KeyError at runtime).
    doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
    doo.to_csv("Proteinas_sin_nombre.csv")
if __name__=="__main__":
    # Script entry point: enrich the nervous-genes dataset with protein names.
    #readData("data_nervous_genes_xf.xlsx","resultados/patronesIdenticosTreat_005.csv","C0007131","ProtByPatternLung005.xlsx")
    add_name_patterns("data_nervous_genes_xf.xlsx","protein_name.csv","nombres_sust.txt","protein_name_clean.csv")
This diff is collapsed.
Hydrophobic I L V C A G M F Y W H K T
Small V C A G D N S T P
Aromatic F Y W H
Positive H K R
Negative E D
Tiny A G S
Aliphatic I L V
Aromatic F Y W H
Positive H K R
Negative E D
......@@ -53,11 +53,11 @@ def metrica_distanciaProteinas():
df.to_csv('resultados/Metrica_distanciaProteinasMismoPatron.csv',
index=False)
def patronesComun(patronesComun):
def patronesComun(patronesComun,archivoEntrada,ocurrencia,archivoClases):
# Leer el archivo CSV y cargar los datos en una lista de diccionarios
registros = []
cl=pd.read_excel("alzheimer_protein_class 2.xlsx")
cl=pd.read_excel(archivoClases)
#cl=substitute_or_remove_prot_id(cl,"r")
#data2=data.copy()
cli=cl.groupby('protein_id')
......@@ -69,7 +69,7 @@ def patronesComun(patronesComun):
do[k]=di
di=[]
class_dict=do
with open("resultados/patronesIdenticos.csv", 'r') as file:
with open("resultados/patronesIdenticos"+str(int((float(ocurrencia)%1)*100))+".csv", 'r') as file:
reader = csv.DictReader(file)
for row in reader:
registros.append(row)
......@@ -96,7 +96,7 @@ def patronesComun(patronesComun):
# Diccionario para almacenar las proteinas que tienen en común cada par de proteinas
proteinas_comunes = {}
rr=[]
df_p = pd.read_excel("data_nervous_genes_xf.xlsx")
df_p = pd.read_excel(archivoEntrada)
#df_p = pd.read_excel("proteinasClase_PC00060.xlsx")
#df_p=substitute_or_remove_prot_id(df_p,"r")
proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values)
......@@ -182,11 +182,11 @@ def patronesComun(patronesComun):
df = pd.DataFrame(output, columns=['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"])
output=[]
if(first2):
df.to_csv('resultados/Metrica_patronesComunes.csv',
df.to_csv('resultados/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',
index=False)
first2=False
else:
df.to_csv('resultados/Metrica_patronesComunes.csv',index=False,header=False,mode='a')
df.to_csv('resultados/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
#else:
#output.append([sorted_pattern_lengths, proteina1, proteina2,
......@@ -199,10 +199,10 @@ def patronesComun(patronesComun):
df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"])
output2=[]
if(first):
df2.to_csv('resultados/Metrica_Coincidencia.csv',index=False)
df2.to_csv('resultados/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False)
first=False
else:
df2.to_csv('resultados/Metrica_Coincidencia.csv',index=False,header=False,mode='a')
df2.to_csv('resultados/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
......@@ -213,11 +213,11 @@ def patronesComun(patronesComun):
def patronesComunClas(patronesComun,name):
def patronesComunClas(patronesComun,name,archivoEntrada,ocurrencia,archivoClases):
# Leer el archivo CSV y cargar los datos en una lista de diccionarios
registros = []
cl=pd.read_excel("alzheimer_protein_class 2.xlsx")
cl=pd.read_excel(archivoClases)
#cl=substitute_or_remove_prot_id(cl,"r")
#data2=data.copy()
cli=cl.groupby('protein_id')
......@@ -229,7 +229,7 @@ def patronesComunClas(patronesComun,name):
do[k]=di
di=[]
class_dict=do
with open("clases/"+name+"/patronesIdenticos.csv", 'r') as file:
with open("clases/"+name+"/patronesIdenticos"+str(int((float(ocurrencia)%1)*100))+".csv", 'r') as file:
reader = csv.DictReader(file)
for row in reader:
registros.append(row)
......@@ -256,7 +256,7 @@ def patronesComunClas(patronesComun,name):
# Diccionario para almacenar las proteinas que tienen en común cada par de proteinas
proteinas_comunes = {}
rr=[]
df_p = pd.read_excel("data_nervous_genes_xf.xlsx")
df_p = pd.read_excel(archivoEntrada)
#df_p = pd.read_excel("proteinasClase_PC00060.xlsx")
#df_p=substitute_or_remove_prot_id(df_p,"r")
proteinas_dict2 = dict(df_p[['protein_id','protein_sequence']].values)
......@@ -342,11 +342,11 @@ def patronesComunClas(patronesComun,name):
df = pd.DataFrame(output, columns=['Patrones', 'Proteina1', 'Proteina2',"classesProt1","classesProt2"])
output=[]
if(first2):
df.to_csv('clases/'+name+'/Metrica_patronesComunes.csv',
df.to_csv('clases/'+name+'/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',
index=False)
first2=False
else:
df.to_csv('clases/'+name+'/Metrica_patronesComunes.csv',index=False,header=False,mode='a')
df.to_csv('clases/'+name+'/Metrica_patronesComunes'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
#else:
#output.append([sorted_pattern_lengths, proteina1, proteina2,
......@@ -359,10 +359,10 @@ def patronesComunClas(patronesComun,name):
df2=pd.DataFrame(output2,columns=['proteina1','proteina2','%Coincidencia',"classesProt1","classesProt2"])
output2=[]
if(first):
df2.to_csv('clases/'+name+'/Metrica_Coincidencia.csv',index=False)
df2.to_csv('clases/'+name+'/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False)
first=False
else:
df2.to_csv('clases/'+name+'/Metrica_Coincidencia.csv',index=False,header=False,mode='a')
df2.to_csv('clases/'+name+'/Metrica_Coincidencia'+str(int((float(ocurrencia)%1)*100))+'.csv',index=False,header=False,mode='a')
......@@ -372,8 +372,8 @@ def patronesComunClas(patronesComun,name):
# index=False)
def remplazar_sequence_for_ID(output):
df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
def remplazar_sequence_for_ID(output,archivoEntrada):
df_b = pd.read_excel(archivoEntrada)
#df_b = pd.read_excel("proteinasClase_PC00060.xlsx")
#df_b=substitute_or_remove_prot_id(df_b,"r")
# Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
......
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#define alphabets 26
#define SHIFTBITS 31
using namespace std;
#ifndef _LOGGER_HPP_
#define _LOGGER_HPP_
#include <iostream>
#include <sstream>
/* consider adding boost thread id since we'll want to know whose writting and
* won't want to repeat it for every single call */
/* consider adding policy class to allow users to redirect logging to specific
* files via the command line
*/
enum loglevel_e
{logERROR, logWARNING, logINFO, logDEBUG, logDEBUG1, logDEBUG2, logDEBUG3, logDEBUG4};
// Minimal stream-style logger: accumulates a message in an internal buffer
// and flushes it to stderr in a single write when the temporary object is
// destroyed (typically at the end of the `log(level) << ...;` statement).
class logIt
{
public:
    // Prefix the message with the numeric level, then indent DEBUG levels
    // by four spaces per level beyond logDEBUG (one space otherwise).
    logIt(loglevel_e _loglevel = logERROR) {
        _buffer << _loglevel << " :"
            << std::string(
                _loglevel > logDEBUG
                ? (_loglevel - logDEBUG) * 4
                : 1
                , ' ');
    }
    // Stream any value into the pending message.
    template <typename T>
    logIt & operator<<(T const & value)
    {
        _buffer << value;
        return *this;
    }
    // Flush the whole message at once on destruction.
    ~logIt()
    {
        _buffer << std::endl;
        // This is atomic according to the POSIX standard
        // http://www.gnu.org/s/libc/manual/html_node/Streams-and-Threads.html
        std::cerr << _buffer.str();
    }
private:
    std::ostringstream _buffer;
};
extern loglevel_e loglevel;
#define log(level) \
if (level > loglevel) ; \
else logIt(level)
#endif
// Needleman-Wunsch global sequence aligner.  The score is computed once in
// the constructor and retrieved with get_score().
class NW{
private:
    //const size_t alphabets = 26;
    // Alignment score computed at construction (may be negative).
    // BUGFIX: was size_t, which silently wrapped negative scores.
    int score = 0;

    /*
     * Fill the (mm+1) x (n+1) dynamic-programming table and return the
     * global-alignment score of aa vs bb.  alpha holds the per-letter-pair
     * scores and alpha_gap the gap penalty.  match/mismatch and the
     * a_aligned/b_aligned outputs are kept for interface compatibility but
     * are not used (no traceback is performed).
     */
    int align(const std::string &aa, const std::string &bb, int alpha_gap, int match, int mismatch,
              int alpha[alphabets][alphabets], std::string &a_aligned,
              std::string &b_aligned)
    {
        size_t n = aa.size();
        size_t mm = bb.size();
        cout << n << " tamaño de la prot1\n";
        cout << mm << "tamaño de la prot2\n";
        long int nRows = mm + 1;
        long int nCols = n + 1;
        int a, b, c; // temps for max calculation
        // BUGFIX: the original called reserve() and then indexed the vector,
        // which is undefined behaviour (reserve does not change the size).
        // Allocate the full table up front instead.
        vector<int> t(static_cast<size_t>(nRows) * nCols);
        t[0] = 0;
        // Populate the first row and column (all-gap prefixes).
        for (long int i = 1; i < nCols; ++i) { t[i] = t[i-1] + alpha_gap; }
        for (long int i = 1; i < nRows; ++i) { t[nCols*i] = t[nCols*(i-1)] + alpha_gap; }
        cout << "tamaño de t " << (nRows*nCols) << "\n";
        // Populate the remaining table in row-major order.
        for (long int i = 1; i < nRows; ++i) {
            for (long int j = 1; j < nCols; ++j) {
                // NOTE(review): assumes sequences contain only 'A'..'Z';
                // any other character indexes alpha out of bounds — confirm.
                a = t[((i-1) * nCols) + j-1] + alpha[aa[j-1]-'A'][bb[i-1]-'A'];
                b = t[((i-1) * nCols) + j] + alpha_gap;
                c = t[(i * nCols) + j-1] + alpha_gap;
                // Branch-free max of a, b, c (SHIFTBITS isolates the sign bit).
                a = a - (((a - b) >> SHIFTBITS) & (a - b));
                a = a - (((a - c) >> SHIFTBITS) & (a - c));
                t[(i * nCols) + j] = a;
            }
        }
        // The bottom-right cell holds the global alignment score.
        return t[(nRows*nCols)-1];
    }

    // Debug helper: print a 2D matrix, one value per line.
    void print2DVector(const vector<vector<int> > &A)
    {
        for (auto& i : A)
        {
            for (auto j : i)
                cout << j << "\n";
            cout << endl;
        }
    }

public:
    /*
     * Build the 26x26 scoring matrix from the amino-acid group map c
     * (two different letters score as a match when the map records a
     * non-zero weight between them) and align a1 against b1.
     */
    NW(std::string a1, std::string b1, std::map<char,std::map<char,float>> c, int match, int mismatch, int gap)
    {
        int gap_penalty = gap;
        int alpha[alphabets][alphabets];
        for (size_t i = 0; i < alphabets; ++i)
        {
            for (size_t j = 0; j < alphabets; ++j)
            {
                // Default to mismatch; identical letters are a match.
                alpha[i][j] = mismatch;
                if (i == j) alpha[i][j] = match;
                // Letters that belong to the same amino-acid group also match.
                else if (c.find((char)('A'+i)) != c.end() || c.find((char)('A'+j)) != c.end()) {
                    float count = 0;
                    if (c.find((char)('A'+i)) != c.end()) {
                        if (c[(char)('A'+i)].find((char)('A'+j)) != c[(char)('A'+i)].end())
                            count = float(c[(char)('A'+i)][(char)('A'+j)]);
                    }
                    else {
                        if (c[(char)('A'+j)].find((char)('A'+i)) != c[(char)('A'+j)].end())
                            count = float(c[(char)('A'+j)][(char)('A'+i)]);
                    }
                    if (count != 0) {
                        alpha[i][j] = match;
                        alpha[j][i] = match;
                    }
                }
            }
        }
        // Aligned sequences (interface placeholders; align() does not fill them).
        std::string a2, b2;
        cout << "aqui se llega" << "\n";
        int penalty = align(a1, b1, gap_penalty, match, mismatch, alpha, a2, b2);
        this->score = penalty;
    }
    // Return the alignment score computed at construction.
    int get_score(){
        return this->score;
    }
};
namespace py = pybind11;

// Expose the NW aligner to Python as module "nw_wrapper".
PYBIND11_MODULE(nw_wrapper, m) {
    py::class_<NW>(m, "NW")
        // NW(seq_a, seq_b, group_map, match, mismatch, gap)
        .def(py::init<std::string, std::string, std::map<char, std::map<char, float>>, int, int, int>())
        .def("get_score", &NW::get_score);
}
{
"NombreArchivoEntrada" : "Data/data_nervous_genes_xf.xlsx",
"CodigoEnfermedad":"C0007131",
"NombreArchivoTarget":"",
"NombreArchivoClases":"lung_cancer_protein_class.xlsx",
"NombreArchivoAA": "Data/aminoacidos_mod_2.txt",
"Similitud":"0.00001",
"OcurrenciaMin":"0.05",
"Metrica":"0",
"NWMatrix":[""],
"NWmatch":"3",
"NWmismatch":"0",
"NWgap":"-1"
}
This diff is collapsed.
......@@ -3,6 +3,15 @@ import Levenshtein
import math
def patrones_similares(pattern_freqMin):
"""
Identifies similar patterns in a dictionary of patterns based on Levenshtein distance.
Parameters:
- pattern_freqMin: dict, dictionary of patterns and their positions.
Returns:
- pattern_freqMin: dict, updated dictionary of patterns with similar patterns merged.
"""
similar_patterns = {} # Guarda los patrones similares relacionados con el patron similar del que parten
num_op = 3
similar_patterns = {} # Guarda los patrones similares relacionados con el patron similar del que parten
......@@ -78,4 +87,4 @@ if __name__ == "__main__":
fin = time.time()
tiempo_total = fin - inicio
print(tiempo_total, "segundos")
\ No newline at end of file
print(tiempo_total, "segundos")
......@@ -16,6 +16,15 @@ from collections import defaultdict
classes={}
min_ocurrence=0
def swap_dict(d):
"""
Swaps keys and values in a dictionary.
Parameters:
- d: dict, input dictionary.
Returns:
- new_dict: dict, dictionary with keys and values swapped.
"""
new_dict = {}
for key, values in d.items():
for value in values:
......@@ -23,47 +32,23 @@ def swap_dict(d):
new_dict[value] = []
new_dict[value].append(key)
return new_dict
def readData(archivoEntrada, enfermedad, archivoTarget):
data = pd.read_excel(archivoEntrada)
dataC = pd.read_csv("resultados/proteinasDescartadas2.csv")
#data=substitute_or_remove_prot_id(data,"r")
#dataC=substitute_or_remove_prot_id(dataC,"r")
#Descarte de proteinas
data = data[~data['protein_id'].isin(dataC['ProteinasDescartadas'])]
print("Se ha realizado el descarte de proteínas")
cla={}
global classes
with open('aminoacidos.txt','r') as op:
line=op.readline()
print(line)
oo=line.split()
key=oo.pop(0)
cla[key]=oo
classes=swap_dict(cla)
# "C0002395"
if(enfermedad != ''):
data = data.loc[data["disease_id"] == enfermedad]
# dataB = pd.read_excel("proteinas_en_comun_Alzheimer.xlsx")
# print("Se han seleccionado las proteínas de la enfermedad elegida")
# dataB=substitute_or_remove_prot_id(dataB,"r")
#if(archivoTarget != ''):
# dataB=substitute_or_remove_prot_id(dataB,"r")
#Eliminar las proteinas target
# data = data[~((data["disease_id"] == enfermedad) &
# (data["protein_id"].isin(dataB["protein_id"])))]
# print("Se han descartado las proteínas del archivo target")
sequences = data["protein_sequence"]
print(sequences)
num_filas = sequences.shape[0]
return sequences, num_filas
def read_aminoacidos(archivoAA):
"""
Reads amino acid information from a file and returns a dictionary with
swapped keys and values, and the original dictionary.
Parameters:
- archivoAA: str, path to the amino acid information file.
def read_aminoacidos():
Returns:
- classes: dict, dictionary with swapped keys and values.
- cla: dict, original dictionary with amino acid information.
"""
cla = {}
with open('aminoacidos.txt', 'r') as op:
with open(archivoAA, 'r') as op:
lines = op.readlines()
for line in lines:
oo = line.replace('\n', '').split('\t')
......@@ -71,10 +56,25 @@ def read_aminoacidos():
cla[key] = oo
return swap_dict(cla), cla
def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence,archivoAA):
"""
Processes protein sequences to find patterns of length 1 and their positions,
filters patterns based on minimum occurrence, and saves results to a CSV file.
Parameters:
- sequences: pandas Series, protein sequences.
- pattern_freqMin: dict, dictionary to store patterns and their occurrences.
- min_ocurrence: int, minimum occurrence threshold.
- archivoAA: str, path to the amino acid information file.
Returns:
- pattern_freqMin: dict, updated dictionary of patterns.
- posicion_patterns: dict, positions of each character in the sequences.
- longitud_max: int, maximum length of protein sequences.
"""
all_patterns = defaultdict(list)
longitud_max = 0
classes, cla = read_aminoacidos()
classes, cla = read_aminoacidos(archivoAA)
for protein in sequences:
longitud = len(protein)
......@@ -111,11 +111,24 @@ def guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence):
df.to_csv('prueba2.csv', index=False)
return pattern_freqMin, posicion_patterns, longitud_max
def buscar_patrones_simAA(sequences, min_ocurr):
def buscar_patrones_simAA(sequences, min_ocurr,archivoAA):
"""
Searches for similar patterns in protein sequences based on amino acid information,
filters patterns based on minimum occurrence, and returns results.
Parameters:
- sequences: pandas Series, protein sequences.
- min_ocurr: int, minimum occurrence threshold.
- archivoAA: str, path to the amino acid information file.
Returns:
- pattern_freqMin: dict, dictionary of patterns and their positions.
- num_patrones: int, number of unique patterns found.
"""
min_ocurrence = min_ocurr
pattern_freqMin = {}
pattern_freqMin, posicion_patterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence)
classes, cla = read_aminoacidos()
pattern_freqMin, posicion_patterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin, min_ocurrence,archivoAA)
classes, cla = read_aminoacidos(archivoAA)
if not bool(pattern_freqMin):
return pattern_freqMin, 0
......@@ -212,13 +225,26 @@ def buscar_patrones_simAA(sequences, min_ocurr):
num_patrones = df.shape[0]
#pattern_freqMin = {k: v for k, v in pattern_freqMin.items() if len(k) >= 4}
return pattern_freqMin, num_patrones
def buscar_patrones_identicos(sequences,min_ocurr):
def buscar_patrones_identicos(sequences,min_ocurr,archivoAA):
"""
Searches for identical patterns of different lengths in protein sequences
based on amino acid information, and returns results.
Parameters:
- sequences: pandas Series, protein sequences.
- min_ocurr: int, minimum occurrence threshold.
- archivoAA: str, path to the amino acid information file.
Returns:
- pattern_freqMin: dict, dictionary of patterns and their positions.
- num_patrones: int, number of unique patterns found.
"""
pattern_freqMin = {}
min_ocurrence=min_ocurr
pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin)
pattern_freqMin, posicionPatterns, longitud_max = guardar_patrones_len1(sequences, pattern_freqMin,archivoAA)
cla={}
num_patrones=0
with open('aminoacidos.txt','r') as op:
with open(archivoAA,'r') as op:
lines=op.readlines()
print(lines)
for line in lines:
......@@ -325,13 +351,23 @@ def buscar_patrones_identicos(sequences,min_ocurr):
return pattern_freqMin, num_patrones
def remplazar_sequence_for_ID(pattern_freqMin):
df_b = pd.read_excel("data_nervous_genes_xf.xlsx")
def remplazar_sequence_for_ID(pattern_freqMin,archivoEntrada,ArchivoAA,ocurrencia):
"""
Replaces identified patterns in the original data with their corresponding IDs,
saves the results to a CSV file, and prints a success message.
Parameters:
- pattern_freqMin: dict, dictionary of patterns and their positions.
- archivoEntrada: str, path to the input Excel file.
- ArchivoAA: str, path to the amino acid information file.
- ocurrencia: float, occurrence parameter (not currently in use).
"""
df_b = pd.read_excel(archivoEntrada)
#df_b=substitute_or_remove_prot_id(df_b,'r')
output = []
global classes
cla={}
with open('aminoacidos.txt','r') as op:
with open(ArchivoAA,'r') as op:
lines=op.readlines()
#print(lines)
for line in lines:
......@@ -378,5 +414,5 @@ def remplazar_sequence_for_ID(pattern_freqMin):
df_a = pd.DataFrame(output_ordered, columns=['Patron', 'Proteina', 'Posiciones'])
# Guardar el DataFrame actualizado en un archivo CSV
df_a.to_csv('resultados/patronesSimilaresAA.csv', index=False)
df_a.to_csv('resultados/patronesSimilaresAA'+str(int((float(ocurrencia)%1)*100))+'.csv', index=False)
print("Se ha generado el .csv con los patrones idénticos encontrados")
......@@ -2,14 +2,57 @@ import pandas as pd
import Levenshtein
from minineedle import needle, smith, core
from descarteProteinas import substitute_or_remove_prot_id
from ast import literal_eval
def readData(archivoEntrada):
"""
Read protein sequences from an Excel file.
Parameters:
- archivoEntrada: Input Excel file path
Returns:
- List of protein sequences
This function reads protein sequences from an Excel file specified by 'archivoEntrada' and extracts the
'protein_sequence' column from the DataFrame. The sequences are returned as a list.
Example:
>>> sequences = readData("protein_data.xlsx")
>>> print(sequences)
['MTCG...', 'MCTA...', ...]
"""
data = pd.read_excel(archivoEntrada)
data=substitute_or_remove_prot_id(data,'r')
#data=substitute_or_remove_prot_id(data,'r')
sequences = data["protein_sequence"]
return sequences
def similitudProteinas(sequences):
"""
Calculate pairwise similarity scores between protein sequences using Levenshtein distance.
Parameters:
- sequences: List of protein sequences
Returns:
- List of lists containing pairwise similarity information:
- [protein_sequence_1, protein_sequence_2, similarity_score]
This function takes a list of protein sequences and calculates pairwise similarity scores
between each pair of protein sequences using Levenshtein distance. The results are returned
in a list of lists.
Example:
>>> sequences = ["MACG", "MACC", "MGCA"]
>>> result = similitudProteinas(sequences)
>>> print(result)
[['MACG', 'MACC', 75.0],
['MACG', 'MGCA', 50.0],
['MACC', 'MACG', 75.0],
['MACC', 'MGCA', 66.67],
['MGCA', 'MACG', 50.0],
['MGCA', 'MACC', 66.67]]
"""
output = []
for row1 in sequences:
for row2 in sequences:
......@@ -20,15 +63,59 @@ def similitudProteinas(sequences):
output.append([row1, row2, similarity*100])
return output
def remplazar_sequence_for_ID(output):
df_b = pd.read_excel("data_nervous_genes_1.xlsx")
df_b=substitute_or_remove_prot_id(df_b,"r")
# Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
def remplazar_sequence_for_ID(output,archivoEntrada,Sal,mode="default"):
"""
Replace protein sequences with protein IDs using a pre-existing DataFrame.
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
Parameters:
- output: List of lists containing similarity information
- mode: Replacement mode (default or drug)
- archivoEntrada: Path to protein information file
- Sal: Extension for output file
This function takes a list of lists containing pairwise similarity information, and replaces
protein sequences with their corresponding protein IDs. The replacement is based on the information
provided in a pre-existing DataFrame. The updated information is saved to a CSV file.
for item in output_ordered:
Example:
>>> data = [['MACG', 'MGCA', 75.0], ['MACC', 'MGCA', 66.67]]
>>> inputFile = "protein_data.xlsx"
>>> outputExt = "protein"
>>> remplazar_sequence_for_ID(data,inputFile,OutputExt, mode="default")
"""
df_b = pd.read_excel(archivoEntrada)
#df_b=substitute_or_remove_prot_id(df_b,"r")
# Ordenar de mayor a menor tamaño. Las subcadenas del mismo tamaño se ordenan por orden alfabetico
#output_ordered = sorted(output, key=lambda x: (-len(x[0]), x[0]))
proteinas_dict = dict(df_b[['protein_sequence', 'protein_id']].values)
if(mode=="drug"):
drug_dict=dict(df_b[['protein_sequence','drug_id']].values)
for item in output:
protein_sequence1 = item[0]
protein_sequence2 = item[1]
res=[]
[res.append(x) for x in literal_eval(drug_dict[item[0]]) if x not in res and ( x != '[' or x != ']') ]
if(len(res) == 1):
item.append(res[0])
elif(len(res)>1):
item.append(res)
else:
item.append("")
res=[]
[res.append(x) for x in literal_eval(drug_dict[item[1]]) if x not in res and ( x != '[' or x != ']')]
if(len(res) == 1):
item.append(res[0])
elif(len(res)>1):
item.append(res)
else:
item.append("")
if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
item[0] = proteinas_dict[protein_sequence1]
item[1] = proteinas_dict[protein_sequence2]
df_a=pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2','drug_id_p1','drug_id_p2'])
else:
for item in output:
protein_sequence1 = item[0]
protein_sequence2 = item[1]
if protein_sequence1 in proteinas_dict and protein_sequence2 in proteinas_dict:
......@@ -37,15 +124,66 @@ def remplazar_sequence_for_ID(output):
df_a = pd.DataFrame(output_ordered, columns=['Proteina1', 'Proteina2', 'Similaridad'])
df_a = pd.DataFrame(output, columns=['Proteina1', 'Proteina2', 'Similaridad','SimilaridadAA','similaridadAA_2'])
# Guardar el DataFrame actualizado en un archivo CSV
df_a.to_csv('AllProteins_%Similitud.csv', index=False)
df_a.to_csv('AllProteins_%Similitud'+Sal+'.csv', index=False)
def similitudMatProteinas(sequences, matrix, matrix2, matriz3):
    """
    Build pairwise similarity percentages between protein sequences from
    three precomputed similarity matrices.

    Parameters:
    - sequences: List of protein sequences.
    - matrix: First similarity matrix (values in [0, 1]).
    - matrix2: Second similarity matrix (values in [0, 1]).
    - matriz3: Third similarity matrix (values in [0, 1]).

    Returns:
    - List of rows, one per ordered pair (i, j) with i != j:
      [sequence_i, sequence_j, matrix[i][j]*100, matrix2[i][j]*100, matriz3[i][j]*100]

    Note: all three matrices are assumed to be square with dimensions
    matching len(sequences); each score is converted to a percentage.

    Example:
    >>> sequences = ["MACG", "MACC"]
    >>> m = [[1.0, 0.8], [0.8, 1.0]]
    >>> similitudMatProteinas(sequences, m, m, m)
    [['MACG', 'MACC', 80.0, 80.0, 80.0], ['MACC', 'MACG', 80.0, 80.0, 80.0]]
    """
    total = len(sequences)
    # Ordered pairs (i, j), i != j, in the same order a nested loop produces.
    return [
        [sequences[i], sequences[j],
         matrix[i][j] * 100, matrix2[i][j] * 100, matriz3[i][j] * 100]
        for i in range(total)
        for j in range(total)
        if i != j
    ]
if __name__ == "__main__":
    # Input dataset with the protein sequences and ids.
    # NOTE(review): readData is called here with a single argument, while the
    # readData defined later in this file takes four positional parameters —
    # confirm which version of readData this entry point targets.
    archivoEntrada = "Data/data_nervous_genes_xf.xlsx"
    sequences = readData(archivoEntrada)
    # Rescale each precomputed similarity matrix: x -> |3*x + 1| / 4.
    # BUG FIX: the previous code called `matrix.abs()` without assigning the
    # result (DataFrame.abs() is not in-place), so the absolute value was
    # silently discarded; the chained/assigned form below applies it.
    matrix = (pd.read_csv('matrizNWc.csv', header=None, index_col=False) * 3 + 1).abs() / 4
    matrix2 = (pd.read_csv('matrizNWmod1.csv', header=None, index_col=False) * 3 + 1.0).abs() / 4
    matrix3 = (pd.read_csv('matrizNWmod2.csv', header=None, index_col=False) * 3 + 1.0).abs() / 4
    # (removed a superseded similitudProteinas / one-argument
    # remplazar_sequence_for_ID pass whose `output` was immediately
    # overwritten below and whose call arity did not match the
    # three-argument call used here)
    output = similitudMatProteinas(sequences, matrix, matrix2, matrix3)
    print("Generada la tabla de con las matrices de similaridad especificadas")
    remplazar_sequence_for_ID(output, archivoEntrada, "Desease")
import pandas as pd
import time
import numpy as np
import re
from ast import literal_eval
def readData(archivoEntrada, enfermedad, patrones_file, Sal):
    """
    Read the protein dataset, optionally filter it by disease, and write a
    pattern -> proteins mapping file.

    Parameters:
    - archivoEntrada (str): Path to the Excel file with the protein data.
    - enfermedad (str): Disease id used to filter the proteins; a falsy value
      keeps the whole dataset.
    - patrones_file (str): Path to the CSV file with the patterns
      (column "Patron").
    - Sal (str): Suffix appended to the output file name.

    Returns:
    - data (pd.DataFrame): The (possibly filtered) protein DataFrame.

    Side effects:
    - Writes "ProtByPattern<Sal>.xlsx", mapping each pattern longer than
      3 characters to the [protein_id, match_position] pairs of the proteins
      whose sequence contains it.
    """
    data = pd.read_excel(archivoEntrada)
    # BUG FIX: the "discarded proteins" report used to snapshot len(data)
    # *after* the filter, so it always printed 0. Snapshot it beforehand.
    initial_count = len(data)
    if enfermedad:
        data = data.loc[data["disease_id"] == enfermedad]
    dataB = pd.read_csv(patrones_file)
    print(len(data))
    alz_filt_data = len(dataB)
    print("Proteins discarded after the main filter: " + str(initial_count - len(data)))
    # NOTE(review): dataB is never filtered in this function, so this line
    # always reports 0 — confirm whether a filtering step is missing here.
    print("Proteins discarded after the common Alzheimer's filter: " + str(alz_filt_data - len(dataB)))
    dataC = {}
    for pattern in dataB["Patron"].unique():
        if len(pattern) > 3:  # very short patterns are too unspecific to map
            # assumes patterns are plain amino-acid strings (no regex
            # metacharacters) — str.contains treats them as regexes; verify.
            mask = data.protein_sequence.str.contains(pattern)
            prot_ids = data[mask]["protein_id"].to_list()
            positions = data[mask]["protein_sequence"].str.find(pattern).to_list()
            print(len(positions))
            print(len(prot_ids))
            # Pair each matching protein id with the position of the match.
            dataC[pattern] = [[prot, pos] for prot, pos in zip(prot_ids, positions)]
    dataG = pd.DataFrame(dataC.items(), columns=["pattern", "proteins"])
    dataG.to_excel("ProtByPattern" + Sal + ".xlsx")
    return data
def add_protein_info_to_data(main_data_path, patterns_info_path, protein_names_path):
    """
    Add protein names and protein information from the original pattern file
    and the names dataset to a DataFrame based on matching patterns.

    Parameters:
    - main_data_path (str): The path to the Excel file containing the main data.
    - patterns_info_path (str): The path to the CSV file containing patterns
      and protein information.
    - protein_names_path (str): The path to the CSV file containing protein names.

    Returns:
    - None: The function writes the enriched data to "<main_data_path base>_aux.xlsx".

    Note:
    - The Excel file at 'main_data_path' must contain 'pattern' and 'proteins'
      columns.
    - The 'patterns_info_path' CSV is expected to have columns 'Patron',
      'Proteina' and 'Posiciones'.
    - The 'protein_names_path' CSV is expected to have columns 'Entry' and
      'Entry_Name'.
    """
    # Read data from files
    main_data = pd.read_excel(main_data_path)
    patterns_info = pd.read_csv(patterns_info_path)
    protein_names = pd.read_csv(protein_names_path)
    # (removed an unused `patterns_info.groupby("Patron")` whose result was
    # never referenced)
    # Initialize the columns that will be filled below.
    main_data["protein_names"] = ""
    main_data["proteins_treat"] = ""
    # One pass per (pattern, protein, positions) row of the patterns file.
    for index, row in patterns_info.iterrows():
        pattern = row["Patron"]
        protein_id = row["Proteina"]
        positions = row["Posiciones"]
        # Rows of the main file that carry this pattern.
        matching_rows = main_data[main_data["pattern"] == pattern]
        # Append [protein_id, positions] to the stringified list held in the
        # "proteins" column. NOTE(review): each iteration rebuilds from the
        # original "proteins" value, so a pattern that appears in several
        # patterns_info rows keeps only the last appended pair — confirm
        # this accumulation behavior is intended.
        main_data.loc[main_data["pattern"] == pattern, "proteins_treat"] = matching_rows["proteins"].apply(
            lambda x: literal_eval(x) + [[protein_id, positions]] if pd.notna(x) else [[protein_id, positions]]
        )
        # Resolve the Entry_Name for every protein id listed in "proteins".
        # NOTE(review): the comprehension variable `protein_id` shadows the
        # loop variable of the same name above — verify this is intentional.
        main_data.loc[main_data["pattern"] == pattern, "protein_names"] = matching_rows["proteins"].apply(
            lambda lst: [protein_names[protein_names["Entry"] == protein_id]["Entry_Name"].to_list() if protein_id else "N/A" for protein_id, _ in literal_eval(lst)]
        )
    # Save the updated data next to the input file with an "_aux" suffix.
    main_data_base_name = main_data_path.split(".")[0]
    main_data.to_excel(f"{main_data_base_name}_aux.xlsx", index=False)
def add_entry_name(archivoEntrada, protein_name_file):
    """
    Export name/length information for the proteins present in the dataset.

    Parameters:
    - archivoEntrada (str): Path to the Excel file with the protein data.
    - protein_name_file (str): Path to the CSV file with the protein names
      (columns 'Entry', 'Entry_Name', 'Protein_names', 'Length').

    Returns:
    - None

    Side effects:
    - Writes "<archivoEntrada>_nombre.csv" with the proteins that have a
      name entry, and "Proteinas_sin_nombre" with those that do not.
    """
    # BUG FIX: the body referenced an undefined name `archivo_entrada`
    # (NameError on every call); the parameter is spelled `archivoEntrada`.
    data = pd.read_excel(archivoEntrada)
    dataB = pd.read_csv(protein_name_file, usecols=['Entry', "Entry_Name", "Protein_names", "Length"])
    # NOTE(review): substitute_or_remove_prot_id is called here with two
    # arguments; other versions of that helper in this project take more —
    # confirm the signature matches.
    dataB = substitute_or_remove_prot_id(dataB, "na")
    dataB = dataB.reindex(columns=['Entry', "Entry_Name", "Length", "Protein_names"])
    # Proteins from the dataset that have a matching name entry.
    datas = dataB[dataB["Entry"].isin(data["protein_id"])]
    datas.to_csv(archivoEntrada + "_nombre.csv")
    # Proteins with no name entry.
    # NOTE(review): output file name has no extension — presumably ".csv"
    # was intended; kept as-is for output compatibility.
    doo = data[~(data["protein_id"].isin(dataB["Entry"]))]
    doo.to_csv("Proteinas_sin_nombre")
if __name__ == "__main__":
    # Build the pattern -> proteins mapping for the full dataset (empty
    # disease id disables the filter), then enrich the resulting file with
    # protein names.
    data = readData("Data/data_nervous_genes_xf.xlsx", "", "patronesIdenticos10Treat.csv", "Lung01")
    # BUG FIX: the previous code called `add_names_prot`, which is not
    # defined anywhere in this file; the function matching this
    # (main_data, patterns_info, protein_names) call signature is
    # `add_protein_info_to_data`.
    add_protein_info_to_data("ProtByPatternLung01.xlsx", "patronesIdenticos10Treat.csv", "protein_name.csv")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment