Skip to content
116 changes: 99 additions & 17 deletions src/MSA/Annotations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,28 +74,102 @@ end
# -------

# This function is useful because of the Julia issue #12495
function _filter(str::String, mask::AbstractArray{Bool})
function _filter(str::AbstractString, mask::AbstractArray{Bool})
@assert length(str) == length(mask) "The string and the mask must have the same length"
# data readable writable
buffer = IOBuffer(Array{UInt8}(undef, lastindex(str)), read = true, write = true)
# To start at the beginning of the buffer:
truncate(buffer, 0)
i = 1
for char in str
@inbounds if mask[i]
write(buffer, char)
end
buffer = IOBuffer(; sizehint = ncodeunits(str))
i = 0
@inbounds for c in str
i += 1
if mask[i]
write(buffer, c)
end
end
String(take!(buffer))
end

_filter(str::String, indexes::AbstractArray{Int}) = String(collect(str)[indexes])
function _filter(str::AbstractString, indexes::AbstractArray{Int})
isempty(indexes) && return ""
if isascii(str)
str[indexes] # fast path: ASCII => indices are valid string indices
else
chars = collect(str) # one Vector{Char}
io = IOBuffer(; sizehint = length(indexes))
@inbounds for i in indexes
write(io, chars[i])
end
String(take!(io))
end
end

"""
For filter column and sequence mapping of the format: ",,,,10,11,,12"
"""
_filter_mapping(str_map::String, mask) = join(split(str_map, ',')[mask], ',')
function _filter_mapping(str_map::AbstractString, mask::AbstractVector{Bool})
io = IOBuffer(; sizehint = ncodeunits(str_map))
wrote = false
nfields = 0
for part in eachsplit(str_map, ','; keepempty = true)
nfields += 1
if nfields > length(mask)
throw(DimensionMismatch("mask length $(length(mask)) < $nfields fields"))
end
if @inbounds mask[nfields]
wrote && write(io, ',')
write(io, part)
wrote = true
end
end
if nfields != length(mask)
throw(DimensionMismatch("mask length $(length(mask)) > $nfields fields"))
end
String(take!(io))
end

# Compute start/stop indices for all comma-separated fields in `str_map`.
function _field_ranges(str_map::AbstractString)
starts = Int[]
stops = Int[]

last = lastindex(str_map)
start = firstindex(str_map)
i = start

while i <= last
if @inbounds str_map[i] == ','
push!(starts, start)
push!(stops, start < i ? prevind(str_map, i) : start - 1)
start = nextind(str_map, i)
end
i = nextind(str_map, i)
end

# final field (possibly empty)
push!(starts, start)
push!(stops, start <= last ? last : start - 1)

return starts, stops
end

function _filter_mapping(str_map::AbstractString, mask::AbstractVector{<:Integer})
isempty(mask) && return ""

starts, stops = _field_ranges(str_map)
nentries = length(starts)

io = IOBuffer(; sizehint = ncodeunits(str_map))
first = true

for raw_idx in mask
idx = Int(raw_idx)
1 <= idx <= nentries || throw(BoundsError(mask, raw_idx))

first || write(io, ',')
first = false

s = @inbounds starts[idx]
e = @inbounds stops[idx]
s <= e && write(io, SubString(str_map, s, e)) # empty field ⇒ nothing
end

return String(take!(io))
end

"""
`filtersequences!(data::Annotations, ids::Vector{String}, mask::AbstractArray{Bool,1})`
Expand Down Expand Up @@ -133,20 +207,28 @@ function filtersequences!(
data
end

function _get_indexes(input_mask)
if eltype(input_mask) <: Bool
return findall(input_mask)
end
input_mask
end

"""
`filtercolumns!(data::Annotations, mask)`

It is useful for deleting column annotations (creating a subset in place).
"""
function filtercolumns!(data::Annotations, mask)
int_mask = _get_indexes(mask)
if length(data.residues) > 0
for (key, value) in data.residues
data.residues[key] = _filter(value, mask)
data.residues[key] = _filter(value, int_mask)
end
end
if length(data.columns) > 0
for (key, value) in data.columns
data.columns[key] = _filter(value, mask)
data.columns[key] = _filter(value, int_mask)
end
end
if length(data.sequences) > 0
Expand Down
31 changes: 12 additions & 19 deletions src/MSA/GeneralParserMethods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,7 @@ abstract type SequenceFormat <: AbstractSequenceFormat end
# ========

# It checks sequence lengths
function _fill_aln_seq_ann!(
aln,
seq_ann::Vector{String},
seq::String,
init::Int,
nres::Int,
i,
)
function _fill_aln_seq_ann!(aln, seq_ann::IOBuffer, seq::String, init::Int, nres::Int, i)
if length(seq) != nres
throw(
ErrorException(
Expand All @@ -35,29 +28,29 @@ function _fill_aln_seq_ann!(
end
j = 1
@inbounds for res in seq
aln[j, i] = res
aln[i, j] = res
j > 1 && write(seq_ann, ',')
if res != '-' && res != '.'
seq_ann[j] = string(init)
write(seq_ann, string(init))
init += 1
else
seq_ann[j] = ""
end
j += 1
end
join(seq_ann, ','), init - 1
String(take!(seq_ann)), init - 1
end

function _to_msa_mapping(sequences::Array{String,1})
nseq = size(sequences, 1)
nres = length(sequences[1])
aln = Array{Residue}(undef, nres, nseq)
aln = Array{Residue}(undef, nseq, nres)
mapp = Array{String}(undef, nseq)
seq_ann = Array{String}(undef, nres)
# sizehint should be enough to store the mappings for the titin (~35000 residues long)
seq_ann = IOBuffer(; append = true, sizehint = 6 * nres)
for i = 1:nseq
# It checks sequence lengths
mapp[i], last = _fill_aln_seq_ann!(aln, seq_ann, sequences[i], 1, nres, i)
end
msa = NamedArray(permutedims(aln, [2, 1]))
msa = NamedArray(aln)
# MSA constructors adds dimension names
# setdimnames!(msa, ("Seq","Col"))
(msa, mapp)
Expand All @@ -66,9 +59,9 @@ end
function _to_msa_mapping(sequences::Array{String,1}, ids)
nseq = size(sequences, 1)
nres = length(sequences[1])
aln = Array{Residue}(undef, nres, nseq)
aln = Array{Residue}(undef, nseq, nres)
mapp = Array{String}(undef, nseq)
seq_ann = Array{String}(undef, nres)
seq_ann = IOBuffer(; append = true, sizehint = 6 * nres)
sep = r"/|-"
for i = 1:nseq
fields = split(ids[i], sep)
Expand All @@ -94,7 +87,7 @@ function _to_msa_mapping(sequences::Array{String,1}, ids)
end
end
msa = NamedArray(
permutedims(aln, [2, 1]),
aln,
(
OrderedDict{String,Int}(zip(ids, 1:nseq)),
OrderedDict{String,Int}(string(i) => i for i = 1:nres),
Expand Down
38 changes: 38 additions & 0 deletions test/MSA/Annotations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,44 @@

@test MSA._filter(str_col, selector) == "dca"
@test MSA._filter_mapping(str_map, selector) == "14,13,11"
bool_mask = Bool[true, false, true, false]
@test MSA._filter_mapping(str_map, bool_mask) == "11,13"
starts, stops = MSA._field_ranges(str_map)
@test starts == [1, 4, 7, 10]
@test stops == [2, 5, 8, 11]

@test MSA._filter(str_col, Int[]) == "" # empty selection
unicode_str = "αβγδ" # non-ASCII characters
@test MSA._filter(unicode_str, [4, 2]) == "δβ"

sparse_map = "1,2,,,30,400"
sparse_bool = Bool[true, false, true, true, false, true]
@test MSA._filter_mapping(sparse_map, sparse_bool) == "1,,,400"
sparse_selector = [6, 3, 6]
@test MSA._filter_mapping(sparse_map, sparse_selector) == "400,,400"
sparse_starts, sparse_stops = MSA._field_ranges(sparse_map)
@test sparse_starts == [1, 3, 5, 6, 7, 10]
@test sparse_stops == [1, 3, 4, 5, 8, 12]

empty_map = ""
@test MSA._filter_mapping(empty_map, Bool[true]) == ""
@test MSA._filter_mapping(empty_map, [1]) == ""
empty_starts, empty_stops = MSA._field_ranges(empty_map)
@test empty_starts == Int[firstindex(empty_map)]
@test empty_stops == Int[0]

short_mask = Bool[true, false]
long_mask = Bool[true, false, true, false, true]
@test_throws DimensionMismatch MSA._filter_mapping(str_map, short_mask)
@test_throws DimensionMismatch MSA._filter_mapping(str_map, long_mask)

unicode_map = "αβ,γδ,,ζ"
unicode_starts, unicode_stops = MSA._field_ranges(unicode_map)
unicode_fields = [
s <= e ? String(SubString(unicode_map, s, e)) : "" for
(s, e) in zip(unicode_starts, unicode_stops)
]
@test unicode_fields == ["αβ", "γδ", "", "ζ"]
end

annot = Annotations()
Expand Down
Loading