Comparing Strings and Finding Differencies

I have done this so far but i cant get the output "mutation" and cant make a table
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N=length(Ref);
results=zeros(1,N);
n=0;
for k=1:2:30
str_var= gene_sequences(k:k+1);
str_var=char(str_var(2));
for i=1:N
if strcmp(Ref(i),str_var(i))
result (i)=0;
else
result(i)=1;
end
if nnz(result)==0
pos_mut=0;
else
pos_mut=find(result==1);
end
end
n=n+1;
fprintf ('%s\t patient %d\t no mutation\t %d\n',str_var, n, pos_mut)
end

 採用された回答

Voss
Voss 2022 年 4 月 18 日
編集済み: Voss 2022 年 4 月 18 日

1 投票

Your calculation of the positions of mutations pos_mut is ok, but it should be done only after result is completely determined; otherwise you are doing unnecessary operations.
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N=length(Ref);
result=zeros(1,N);
n=0;
for k=1:2:30
str_var= gene_sequences(k:k+1);
str_var=char(str_var(2));
for i=1:N
if strcmp(Ref(i),str_var(i))
result(i)=0;
else
result(i)=1;
end
end
% do the calculation of 'pos_mut' only after 'result'
% has been completely calculated:
if nnz(result)==0
pos_mut=0;
else
pos_mut=find(result==1);
end
n=n+1;
fprintf ('%s\t patient %d\t no mutation\t %d\n',str_var, n, pos_mut)
end
accgtttacggttagtcctg patient 1 no mutation 4 accatttacggttagtcctg patient 2 no mutation 0 accatttacggttagtcctg patient 3 no mutation 0 accatttacggttagtcctg patient 4 no mutation 0 accatttacggttagtcctg patient 5 no mutation 0 accatttacagttagtcctg patient 6 no mutation 10 accatttacggttagtcctg patient 7 no mutation 0 accatttacggtcagtcctg patient 8 no mutation 13 accatttacggttagtcctg patient 9 no mutation 0 accatttacggttagtcctg patient 10 no mutation 0 accatttacggttagtcctg patient 11 no mutation 0 accatttacggttagttctg patient 12 no mutation 17 accatttacggttagtcctg patient 13 no mutation 0 accatttacggttagtcctg patient 14 no mutation 0 accatttacggttagtcctg patient 15 no mutation 0
But it can be simplified:
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N=length(Ref);
n=0;
for k=1:2:30
str_var= gene_sequences(k:k+1);
str_var=char(str_var(2));
pos_mut = find(Ref ~= str_var); % you can compare elements of two char arrays like this, if they are the same length (just like any other type of array)
if isempty(pos_mut)
pos_mut = 0;
end
n=n+1;
fprintf ('%s\t patient %d\t no mutation\t %d\n',str_var, n, pos_mut)
end
accgtttacggttagtcctg patient 1 no mutation 4 accatttacggttagtcctg patient 2 no mutation 0 accatttacggttagtcctg patient 3 no mutation 0 accatttacggttagtcctg patient 4 no mutation 0 accatttacggttagtcctg patient 5 no mutation 0 accatttacagttagtcctg patient 6 no mutation 10 accatttacggttagtcctg patient 7 no mutation 0 accatttacggtcagtcctg patient 8 no mutation 13 accatttacggttagtcctg patient 9 no mutation 0 accatttacggttagtcctg patient 10 no mutation 0 accatttacggttagtcctg patient 11 no mutation 0 accatttacggttagttctg patient 12 no mutation 17 accatttacggttagtcctg patient 13 no mutation 0 accatttacggttagtcctg patient 14 no mutation 0 accatttacggttagtcctg patient 15 no mutation 0
And I guess you'd rather use the first row of gene_sequences instead of assuming that patient number always goes 1, 2, 3, ..., and also have the result say "no mutation" only when there is no mutation:
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
for k = 1:size(gene_sequences,2) % loop over columns of gene_sequences
str_var = gene_sequences(:,k); % use both elements of column k: first element is patient number
pos_mut = find(Ref ~= str_var{2}); % second element is nucleotide sequence
if isempty(pos_mut)
pos_mut = 0;
str_mut = 'no mutation';
else
str_mut = 'mutation';
end
fprintf('%s\t patient %d\t %-12s\t %d\n', str_var{2}, str_var{1}, str_mut, pos_mut)
end
accgtttacggttagtcctg patient 1 mutation 4 accatttacggttagtcctg patient 2 no mutation 0 accatttacggttagtcctg patient 3 no mutation 0 accatttacggttagtcctg patient 4 no mutation 0 accatttacggttagtcctg patient 5 no mutation 0 accatttacagttagtcctg patient 6 mutation 10 accatttacggttagtcctg patient 7 no mutation 0 accatttacggtcagtcctg patient 8 mutation 13 accatttacggttagtcctg patient 9 no mutation 0 accatttacggttagtcctg patient 10 no mutation 0 accatttacggttagtcctg patient 11 no mutation 0 accatttacggttagttctg patient 12 mutation 17 accatttacggttagtcctg patient 13 no mutation 0 accatttacggttagtcctg patient 14 no mutation 0 accatttacggttagtcctg patient 15 no mutation 0
And put the results into a cell array instead of fprintf-ing them to the command line:
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N = size(gene_sequences,2);
result = cell(N,4);
result(:,1) = gene_sequences(2,:);
result(:,2) = sprintfc('patient %d',[gene_sequences{1,:}]);
for k = 1:N
str_var = gene_sequences(:,k); % use both elements of column k: first element is patient number
pos_mut = find(Ref ~= str_var{2}); % second element is nucleotide sequence
if isempty(pos_mut)
pos_mut = 0;
str_mut = 'no mutation';
else
str_mut = 'mutation';
end
result(k,[3 4]) = {str_mut pos_mut};
end
result
result = 15×4 cell array
{'accgtttacggttagtcctg'} {'patient 1' } {'mutation' } {[ 4]} {'accatttacggttagtcctg'} {'patient 2' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 3' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 4' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 5' } {'no mutation'} {[ 0]} {'accatttacagttagtcctg'} {'patient 6' } {'mutation' } {[10]} {'accatttacggttagtcctg'} {'patient 7' } {'no mutation'} {[ 0]} {'accatttacggtcagtcctg'} {'patient 8' } {'mutation' } {[13]} {'accatttacggttagtcctg'} {'patient 9' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 10'} {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 11'} {'no mutation'} {[ 0]} {'accatttacggttagttctg'} {'patient 12'} {'mutation' } {[17]} {'accatttacggttagtcctg'} {'patient 13'} {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 14'} {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 15'} {'no mutation'} {[ 0]}

1 件のコメント

Row Sarox
Row Sarox 2022 年 4 月 18 日
Thanks. I am just started learning matlab, therefore i couldn't think the way you did.

サインインしてコメントする。

その他の回答 (1 件)

Stephen23
Stephen23 2022 年 4 月 18 日
編集済み: Stephen23 2022 年 4 月 18 日

1 投票

Your current approach and other simple code based on STRCMP or comparing entire character vectors is not robust to insertions, deletions, and strings of different lengths. A much more robust approach uses the Bioinformatics Toolbox, for example https://www.mathworks.com/help/bioinfo/ref/nwalign.html to get a robust global sequence match.
ref = 'accatttacggttagtcctg';
inp = {'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
vec = 1:numel(inp);
[idx,msg] = cellfun(@(s)myfun(s,ref),inp(:),'uni',0);
tmp = compose('patient %u',1:numel(inp));
out = [inp(:),tmp(:),msg,idx]
out = 15×4 cell array
{'accgtttacggttagtcctg'} {'patient 1' } {'mutation' } {[ 4]} {'accatttacggttagtcctg'} {'patient 2' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 3' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 4' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 5' } {'no mutation'} {[ 0]} {'accatttacagttagtcctg'} {'patient 6' } {'mutation' } {[10]} {'accatttacggttagtcctg'} {'patient 7' } {'no mutation'} {[ 0]} {'accatttacggtcagtcctg'} {'patient 8' } {'mutation' } {[13]} {'accatttacggttagtcctg'} {'patient 9' } {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 10'} {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 11'} {'no mutation'} {[ 0]} {'accatttacggttagttctg'} {'patient 12'} {'mutation' } {[17]} {'accatttacggttagtcctg'} {'patient 13'} {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 14'} {'no mutation'} {[ 0]} {'accatttacggttagtcctg'} {'patient 15'} {'no mutation'} {[ 0]}
function [idx,msg] = myfun(st1,st2)
[~,aln] = nwalign(st1,st2); % requires Bioinformatics Toolbox.
idx = find(aln(2,:)~='|');
msg = 'mutation';
if isempty(idx)
msg = 'no mutation';
idx = 0;
end
end

カテゴリ

ヘルプ センター および File ExchangeApp Building についてさらに検索

質問済み:

2022 年 4 月 17 日

編集済み:

2022 年 4 月 18 日

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by