diff --git a/Project.toml b/Project.toml index db9b07a7..109ac079 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ authors = ["Andreas Hildebrandt and contribut [deps] AutoHashEquals = "15f4f7f2-30c1-5605-9d31-71845cf9641f" -BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864" @@ -39,7 +38,6 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a" [compat] Aqua = "0.8.14" AutoHashEquals = "2" -BioStructures = "4.2" BioSymbols = "5" CSV = "0.10.13" CellListMap = "0.9.5" diff --git a/src/BiochemicalAlgorithms.jl b/src/BiochemicalAlgorithms.jl index 8d0e0ff5..f092b7fe 100644 --- a/src/BiochemicalAlgorithms.jl +++ b/src/BiochemicalAlgorithms.jl @@ -86,6 +86,13 @@ include("fileformats/pdb/pdb_general.jl") include("fileformats/pdb/pdb_writer.jl") end +include("fileformats/cif.jl") + +module MMCIFDetails +include("fileformats/mmcif/mmcif_reader.jl") +include("fileformats/mmcif/mmcif_writer.jl") +end + include("fileformats/sdfile.jl") # mappings diff --git a/src/fileformats/cif.jl b/src/fileformats/cif.jl new file mode 100644 index 00000000..b1717975 --- /dev/null +++ b/src/fileformats/cif.jl @@ -0,0 +1,260 @@ +# CIF 1.1 / CIF 2.0 Parser in Julia +# Includes in-memory model for structured access + +@enum CIFVersion v1_1 v2_0 +@enum ParserState begin + Start + InDataBlock + InLoopHeader + InLoopBody + InMultilineText + InTripleQuotedString + ExpectValue + Error +end + +struct CIFLoop + tags::Vector{String} + rows::Vector{Vector{String}} +end + +mutable struct CIFDataBlock + name::String + tags::Dict{String, String} + loops::Vector{CIFLoop} +end + +mutable struct CIFFile + version::CIFVersion + blocks::Dict{String, CIFDataBlock} +end + +mutable struct CIFParser + version::CIFVersion + state::ParserState + current_tag::Union{Nothing, String} + loop_tags::Vector{String} + loop_data::Vector{String} # flat accumulation of all loop values + line_number::Int + in_text::Bool + text_buffer::Vector{String} + triple_quote_delim::Union{Nothing, String} + current_block::Union{Nothing, CIFDataBlock} + file::CIFFile + pre_text_state::ParserState # state before entering text block +end + +function CIFParser() + CIFParser(v1_1, Start, nothing, String[], String[], 0, false, String[], nothing, nothing, CIFFile(v1_1, Dict()), Start) +end + +function parse_line!(parser::CIFParser, line::String) + parser.line_number += 1 + stripped = strip(line) + + if parser.line_number == 1 && startswith(stripped, "#\\#CIF_2.0") + parser.version = v2_0 + parser.file.version = v2_0 + return + end + + if isempty(stripped) || startswith(stripped, '#') + return + end + + if parser.version == v2_0 && !isnothing(parser.triple_quote_delim) + if occursin(parser.triple_quote_delim, line) + push!(parser.text_buffer, split(line, parser.triple_quote_delim)[1]) + store_key_value(parser, parser.current_tag, join(parser.text_buffer, "\n")) + parser.triple_quote_delim = nothing + parser.current_tag = nothing + parser.state = InDataBlock + else + push!(parser.text_buffer, line) + end + return + end + + if parser.in_text + if startswith(line, ";") + text_value = join(parser.text_buffer, "\n") + if parser.pre_text_state == InLoopBody + # Multiline text inside a loop — append to loop data + push!(parser.loop_data, text_value) + parser.state = InLoopBody + else + store_key_value(parser, parser.current_tag, text_value) + parser.state = InDataBlock + end + parser.current_tag = nothing + parser.text_buffer = String[] + parser.in_text = false + else + push!(parser.text_buffer, line) + end + return + end + + if startswith(stripped, "data_") + # Finalize any pending loop before switching data blocks + finalize_loop!(parser) + name = stripped[6:end] + parser.current_block = CIFDataBlock(name, Dict(), CIFLoop[]) + parser.file.blocks[name] = parser.current_block + parser.state = InDataBlock + return + elseif startswith(stripped, "loop_") + # Finalize any pending loop before starting a new one + finalize_loop!(parser) + parser.state = InLoopHeader + empty!(parser.loop_tags) + empty!(parser.loop_data) + return + elseif parser.state == InLoopHeader && startswith(stripped, "_") + push!(parser.loop_tags, String(stripped)) + return + elseif parser.state in (InLoopHeader, InLoopBody) && !startswith(stripped, "_") + # Check for semicolon text block in loop data + if startswith(line, ";") + parser.pre_text_state = InLoopBody + parser.in_text = true + parser.text_buffer = [line[2:end]] + parser.state = InMultilineText + return + end + parser.state = InLoopBody + values = parse_compound_values(String(stripped)) + append!(parser.loop_data, values) + return + elseif startswith(stripped, "_") + # If we were in a loop, finalize it first + if parser.state == InLoopBody + finalize_loop!(parser) + end + # Tag-value pair: may be on one line or split across two + tokens = parse_compound_values(String(stripped)) + if length(tokens) >= 2 + # Inline tag-value: _tag value + store_key_value(parser, tokens[1], join(tokens[2:end], " ")) + parser.state = InDataBlock + else + parser.current_tag = tokens[1] + parser.state = ExpectValue + end + return + elseif parser.state == ExpectValue + if parser.version == v2_0 && (startswith(line, "\"\"\"") || startswith(line, "'''")) + delim = startswith(line, "\"\"\"") ? "\"\"\"" : "'''" + content = replace(line, delim => "") + parser.triple_quote_delim = delim + parser.text_buffer = [content] + parser.state = InTripleQuotedString + return + elseif startswith(line, ";") + parser.pre_text_state = ExpectValue + parser.in_text = true + parser.text_buffer = [line[2:end]] + parser.state = InMultilineText + return + else + store_key_value(parser, parser.current_tag, String(stripped)) + parser.current_tag = nothing + parser.state = InDataBlock + return + end + else + parser.state = Error + @warn "Unexpected line at $(parser.line_number): $line" + end +end + +function parse_compound_values(line::String) + result = String[] + buffer = IOBuffer() + in_list = false + in_table = false + in_quote = false + quote_char = ' ' + + for c in line + if c in ('"', '\'') + if in_quote && c == quote_char + in_quote = false + push!(result, String(take!(buffer))) + elseif !in_quote + in_quote = true + quote_char = c + else + write(buffer, c) + end + elseif c == '[' + in_list = true + write(buffer, c) + elseif c == ']' + in_list = false + write(buffer, c) + push!(result, String(take!(buffer))) + elseif c == '{' + in_table = true + write(buffer, c) + elseif c == '}' + in_table = false + write(buffer, c) + push!(result, String(take!(buffer))) + elseif isspace(c) && !in_quote && !in_list && !in_table + if buffer.size > 0 + push!(result, String(take!(buffer))) + end + else + write(buffer, c) + end + end + if buffer.size > 0 + push!(result, String(take!(buffer))) + end + return result +end + +function store_key_value(parser::CIFParser, tag::Union{Nothing, String}, value::String) + tag === nothing && return + if parser.current_block !== nothing + parser.current_block.tags[tag] = value + end +end + +function finalize_loop!(parser::CIFParser) + if parser.current_block !== nothing && !isempty(parser.loop_tags) && !isempty(parser.loop_data) + ntags = length(parser.loop_tags) + nvals = length(parser.loop_data) + + if nvals % ntags != 0 + @warn "CIF loop has $(nvals) values for $(ntags) tags — not evenly divisible; dropping trailing partial row" + end + + # Reshape flat values into rows of ntags columns (drop incomplete trailing row) + nrows = nvals ÷ ntags + rows = Vector{Vector{String}}() + for r in 0:(nrows - 1) + push!(rows, parser.loop_data[r*ntags+1:(r+1)*ntags]) + end + + push!(parser.current_block.loops, CIFLoop(copy(parser.loop_tags), rows)) + end + empty!(parser.loop_tags) + empty!(parser.loop_data) +end + +function parse_cif(io::IO) + parser = CIFParser() + for line in eachline(io) + parse_line!(parser, line) + end + finalize_loop!(parser) + return parser.file +end + +function parse_cif_file(path::String) + open(path, "r") do io + parse_cif(io) + end +end diff --git a/src/fileformats/mmcif/mmcif_reader.jl b/src/fileformats/mmcif/mmcif_reader.jl new file mode 100644 index 00000000..d9eb0f97 --- /dev/null +++ b/src/fileformats/mmcif/mmcif_reader.jl @@ -0,0 +1,493 @@ +using BiochemicalAlgorithms +using BiochemicalAlgorithms: CIFFile, CIFDataBlock, CIFLoop, CIFParser, + parse_cif_file, parse_cif, parse_element_string, _fragment_variant +import BiochemicalAlgorithms: PDBDetails +using DataStructures: Deque + +""" + read_mmcif(fname_io::Union{AbstractString, IO}, ::Type{T} = Float32; create_coils::Bool = true) -> System{T} + +Read a PDBx/mmCIF file and return a System. + +Models are stored as frames, using the model number as `frame_id`. +""" +function read_mmcif(fname_io::Union{AbstractString, IO}, ::Type{T} = Float32; + create_coils::Bool = true) where {T <: Real} + cif = if fname_io isa AbstractString + parse_cif_file(fname_io) + else + parse_cif(fname_io) + end + + if isempty(cif.blocks) + error("mmCIF file contains no data blocks") + end + + block = first(values(cif.blocks)) + + # Set system name from data block name (or filename if available) + sys_name = if fname_io isa AbstractString + bn = basename(fname_io) + # strip extension + idx = findlast('.', bn) + isnothing(idx) ? bn : bn[1:idx-1] + else + block.name + end + + sys = System{T}(sys_name) + mol = Molecule(sys; name = sys_name) + + # Parse _atom_site loop + atom_loop = _find_loop(block, "_atom_site.") + if isnothing(atom_loop) + return sys + end + + _build_atoms!(sys, mol, atom_loop, T) + + # Build fragment cache for postprocessing + fragment_cache = Dict{PDBDetails.UniqueResidueID, Fragment{T}}() + for f in fragments(sys) + fragment_cache[PDBDetails.UniqueResidueID( + f.name, + parent_chain(f).name, + f.number, + get_property(f, :insertion_code, " ") + )] = f + end + + # Parse disulfide bonds from _struct_conn + ssbonds = _parse_ssbonds(block) + PDBDetails.postprocess_ssbonds_!(sys, ssbonds, fragment_cache) + + # Parse secondary structure from _struct_conf and _struct_sheet_range + ss_records = _parse_secondary_structures(block) + PDBDetails.postprocess_secondary_structures_!(sys, ss_records, fragment_cache, create_coils) + + sys +end + +# ─── Helpers ────────────────────────────────────────────────────────── + +"""Find a loop in the data block whose tags start with the given prefix.""" +function _find_loop(block::CIFDataBlock, prefix::String) + for loop in block.loops + if !isempty(loop.tags) && startswith(loop.tags[1], prefix) + return loop + end + end + return nothing +end + +"""Build a tag→column-index map for a loop.""" +@inline function _col_map(loop::CIFLoop) + Dict(tag => i for (i, tag) in enumerate(loop.tags)) +end + +"""Get a value from a row, returning `nothing` for CIF missing values (? and .).""" +@inline function _get(row::Vector{String}, col::Int) + v = row[col] + (v == "?" || v == ".") ? nothing : v +end + +"""Get a value with a default for missing.""" +@inline function _get(row::Vector{String}, col::Int, default) + v = _get(row, col) + isnothing(v) ? default : v +end + +"""Get an optional column index, returning nothing if the tag is not present.""" +@inline function _optcol(cols::Dict{String, Int}, tag::String) + get(cols, tag, nothing) +end + +"""Get a required column index, trying `primary` first then `fallback`. Returns nothing if neither exists.""" +@inline function _reqcol(cols::Dict{String, Int}, primary::String, fallback::String) + c = get(cols, primary, nothing) + isnothing(c) ? get(cols, fallback, nothing) : c +end + +"""Parse mmCIF formal charge string. Handles plain integers ("1", "-1") and sign-suffixed ("1+", "2-").""" +function _parse_formal_charge(s::Union{Nothing, String}) + isnothing(s) && return Int(0) + s = strip(s) + isempty(s) && return Int(0) + # Try plain integer first + v = tryparse(Int, s) + !isnothing(v) && return v + # Try sign-suffixed format: "1+", "2-", etc. + if length(s) >= 2 + sign_char = s[end] + num_part = s[1:end-1] + if sign_char == '+' || sign_char == '-' + n = tryparse(Int, num_part) + if !isnothing(n) + return sign_char == '-' ? -n : n + end + end + end + return Int(0) +end + +# ─── Atom Site Parsing ──────────────────────────────────────────────── + +function _build_atoms!(sys::System{T}, mol::Molecule{T}, loop::CIFLoop, ::Type{T}) where {T <: Real} + cols = _col_map(loop) + + # Required columns + c_group = cols["_atom_site.group_PDB"] + c_id = cols["_atom_site.id"] + c_symbol = cols["_atom_site.type_symbol"] + c_atom_name = cols["_atom_site.label_atom_id"] + c_alt_id = cols["_atom_site.label_alt_id"] + c_comp_id = cols["_atom_site.label_comp_id"] + c_asym_id = cols["_atom_site.label_asym_id"] + c_seq_id = cols["_atom_site.label_seq_id"] + c_x = cols["_atom_site.Cartn_x"] + c_y = cols["_atom_site.Cartn_y"] + c_z = cols["_atom_site.Cartn_z"] + + # Optional columns (use auth_* when available, fall back to label_*) + c_auth_asym = _optcol(cols, "_atom_site.auth_asym_id") + c_auth_comp = _optcol(cols, "_atom_site.auth_comp_id") + c_auth_seq = _optcol(cols, "_atom_site.auth_seq_id") + c_auth_atom = _optcol(cols, "_atom_site.auth_atom_id") + c_ins_code = _optcol(cols, "_atom_site.pdbx_PDB_ins_code") + c_occupancy = _optcol(cols, "_atom_site.occupancy") + c_bfactor = _optcol(cols, "_atom_site.B_iso_or_equiv") + c_charge = _optcol(cols, "_atom_site.pdbx_formal_charge") + c_model = _optcol(cols, "_atom_site.pdbx_PDB_model_num") + + current_chain::Union{Chain{T}, Nothing} = nothing + current_frag::Union{Fragment{T}, Nothing} = nothing + current_chain_id = "" + current_frag_key = ("", 0, "") # (comp_id, seq_id, ins_code) + + altloc_warning = false + first_altloc_id = Dict{Tuple{String, Int, String, String}, String}() # (chain, seq, ins, atom_name) -> first altloc + + for row in loop.rows + # Alternate location handling + alt_id_raw = row[c_alt_id] + alt_id = (alt_id_raw == "." || alt_id_raw == "?") ? nothing : alt_id_raw + + # Use auth_* fields when available (matches PDB convention) + chain_id = isnothing(c_auth_asym) ? row[c_asym_id] : _get(row, c_auth_asym, row[c_asym_id]) + comp_id = isnothing(c_auth_comp) ? row[c_comp_id] : _get(row, c_auth_comp, row[c_comp_id]) + atom_name = isnothing(c_auth_atom) ? row[c_atom_name] : _get(row, c_auth_atom, row[c_atom_name]) + + seq_id_str = isnothing(c_auth_seq) ? _get(row, c_seq_id, "0") : _get(row, c_auth_seq, _get(row, c_seq_id, "0")) + seq_id = tryparse(Int, seq_id_str) + if isnothing(seq_id) + seq_id = 0 + end + + ins_code = isnothing(c_ins_code) ? " " : _get(row, c_ins_code, " ") + + # Skip alternate locations beyond the first + if !isnothing(alt_id) + key = (chain_id, seq_id, ins_code, atom_name) + existing = get(first_altloc_id, key, nothing) + if isnothing(existing) + first_altloc_id[key] = alt_id + elseif alt_id != existing + if !altloc_warning + @warn "load_mmcif: alternate locations other than $(existing) are currently not supported. Affected records have been ignored!" + altloc_warning = true + end + continue + end + end + + # Chain + if isnothing(current_chain) || chain_id != current_chain_id + current_chain = Chain(mol; name = chain_id) + current_chain_id = chain_id + current_frag = nothing + current_frag_key = ("", 0, "") + end + + # Fragment + frag_key = (comp_id, seq_id, ins_code) + if isnothing(current_frag) || frag_key != current_frag_key + is_hetero = strip(row[c_group]) == "HETATM" + current_frag = Fragment(current_chain, seq_id; + name = comp_id, + variant = _fragment_variant(comp_id), + properties = Properties([ + :is_hetero_fragment => is_hetero, + :insertion_code => ins_code + ]) + ) + current_frag_key = frag_key + end + + # Atom + serial = parse(Int, row[c_id]) + element = parse_element_string(strip(row[c_symbol])) + + x = parse(T, row[c_x]) + y = parse(T, row[c_y]) + z = parse(T, row[c_z]) + + occupancy = isnothing(c_occupancy) ? T(1.0) : T(parse(Float64, _get(row, c_occupancy, "1.0"))) + tempfactor = isnothing(c_bfactor) ? T(0.0) : T(parse(Float64, _get(row, c_bfactor, "0.0"))) + + charge_str = isnothing(c_charge) ? nothing : _get(row, c_charge) + formal_charge = _parse_formal_charge(charge_str) + + model_num = isnothing(c_model) ? 1 : parse(Int, _get(row, c_model, "1")) + + is_hetero_atom = strip(row[c_group]) == "HETATM" + is_deuterium = strip(row[c_symbol]) == "D" + + flags = Flags() + is_hetero_atom && push!(flags, :is_hetero_atom) + is_deuterium && push!(flags, :is_deuterium) + + Atom(current_frag, serial, element; + name = atom_name, + r = Vector3{T}(x, y, z), + formal_charge = formal_charge, + properties = Properties([ + :tempfactor => tempfactor, + :occupancy => occupancy, + :is_hetero_atom => is_hetero_atom, + :insertion_code => ins_code + ]), + flags = flags, + frame_id = model_num + ) + end +end + +# ─── SSBond Parsing ────────────────────────────────────────────────── + +function _parse_ssbonds(block::CIFDataBlock) + ssbonds = Deque{PDBDetails.SSBondRecord}() + loop = _find_loop(block, "_struct_conn.") + isnothing(loop) && return ssbonds + + cols = _col_map(loop) + + c_type = get(cols, "_struct_conn.conn_type_id", nothing) + isnothing(c_type) && return ssbonds + + # Use auth fields for residue identification, fall back to label + c_p1_asym = _reqcol(cols, "_struct_conn.ptnr1_auth_asym_id", "_struct_conn.ptnr1_label_asym_id") + c_p1_comp = _reqcol(cols, "_struct_conn.ptnr1_auth_comp_id", "_struct_conn.ptnr1_label_comp_id") + c_p1_seq = _reqcol(cols, "_struct_conn.ptnr1_auth_seq_id", "_struct_conn.ptnr1_label_seq_id") + c_p2_asym = _reqcol(cols, "_struct_conn.ptnr2_auth_asym_id", "_struct_conn.ptnr2_label_asym_id") + c_p2_comp = _reqcol(cols, "_struct_conn.ptnr2_auth_comp_id", "_struct_conn.ptnr2_label_comp_id") + c_p2_seq = _reqcol(cols, "_struct_conn.ptnr2_auth_seq_id", "_struct_conn.ptnr2_label_seq_id") + + # All partner columns are required for SSBond parsing + if any(isnothing, (c_p1_asym, c_p1_comp, c_p1_seq, c_p2_asym, c_p2_comp, c_p2_seq)) + @warn "mmCIF _struct_conn loop is missing required partner columns; skipping SSBond parsing" + return ssbonds + end + + c_p1_ins = _optcol(cols, "_struct_conn.pdbx_ptnr1_PDB_ins_code") + c_p2_ins = _optcol(cols, "_struct_conn.pdbx_ptnr2_PDB_ins_code") + c_sym1 = _optcol(cols, "_struct_conn.ptnr1_symmetry") + c_sym2 = _optcol(cols, "_struct_conn.ptnr2_symmetry") + c_dist = _optcol(cols, "_struct_conn.pdbx_dist_value") + + n = 0 + for row in loop.rows + strip(row[c_type]) == "disulf" || continue + n += 1 + + p1_ins = isnothing(c_p1_ins) ? " " : _get(row, c_p1_ins, " ") + p2_ins = isnothing(c_p2_ins) ? " " : _get(row, c_p2_ins, " ") + + first_res = PDBDetails.UniqueResidueID( + strip(row[c_p1_comp]), + strip(row[c_p1_asym]), + parse(Int, strip(row[c_p1_seq])), + p1_ins + ) + second_res = PDBDetails.UniqueResidueID( + strip(row[c_p2_comp]), + strip(row[c_p2_asym]), + parse(Int, strip(row[c_p2_seq])), + p2_ins + ) + + sym1 = isnothing(c_sym1) ? 0 : _parse_symmetry_operator(_get(row, c_sym1, "0")) + sym2 = isnothing(c_sym2) ? 0 : _parse_symmetry_operator(_get(row, c_sym2, "0")) + dist = isnothing(c_dist) ? 0.0 : parse(Float64, _get(row, c_dist, "0.0")) + + push!(ssbonds, PDBDetails.SSBondRecord(n, first_res, second_res, sym1, sym2, dist)) + end + + return ssbonds +end + +"""Parse CIF symmetry operator string like '1_555' into an integer.""" +function _parse_symmetry_operator(s::String) + s = strip(s) + (s == "?" || s == "." || isempty(s)) && return 0 + # Try parsing as integer directly + v = tryparse(Int, replace(s, "_" => "")) + isnothing(v) ? 0 : v +end + +# ─── Secondary Structure Parsing ───────────────────────────────────── + +function _parse_secondary_structures(block::CIFDataBlock) + records = Deque{Union{PDBDetails.HelixRecord, PDBDetails.SheetRecord, PDBDetails.TurnRecord}}() + + _parse_helices!(records, block) + _parse_sheets!(records, block) + + return records +end + +function _parse_helices!(records, block::CIFDataBlock) + loop = _find_loop(block, "_struct_conf.") + isnothing(loop) && return + + cols = _col_map(loop) + + c_id = get(cols, "_struct_conf.id", nothing) + isnothing(c_id) && return + + # Use auth fields when available, fall back to label + c_beg_comp = _reqcol(cols, "_struct_conf.beg_auth_comp_id", "_struct_conf.beg_label_comp_id") + c_beg_asym = _reqcol(cols, "_struct_conf.beg_auth_asym_id", "_struct_conf.beg_label_asym_id") + c_beg_seq = _reqcol(cols, "_struct_conf.beg_auth_seq_id", "_struct_conf.beg_label_seq_id") + c_end_comp = _reqcol(cols, "_struct_conf.end_auth_comp_id", "_struct_conf.end_label_comp_id") + c_end_asym = _reqcol(cols, "_struct_conf.end_auth_asym_id", "_struct_conf.end_label_asym_id") + c_end_seq = _reqcol(cols, "_struct_conf.end_auth_seq_id", "_struct_conf.end_label_seq_id") + + # All residue columns are required + if any(isnothing, (c_beg_comp, c_beg_asym, c_beg_seq, c_end_comp, c_end_asym, c_end_seq)) + @warn "mmCIF _struct_conf loop is missing required residue columns; skipping helix parsing" + return + end + + c_helix_id = _optcol(cols, "_struct_conf.pdbx_PDB_helix_id") + c_helix_class = _optcol(cols, "_struct_conf.pdbx_PDB_helix_class") + c_details = _optcol(cols, "_struct_conf.details") + c_beg_ins = _optcol(cols, "_struct_conf.pdbx_beg_PDB_ins_code") + c_end_ins = _optcol(cols, "_struct_conf.pdbx_end_PDB_ins_code") + + n = 0 + for row in loop.rows + n += 1 + + helix_name = isnothing(c_helix_id) ? _get(row, c_id, "H$n") : _get(row, c_helix_id, "H$n") + beg_ins = isnothing(c_beg_ins) ? " " : _get(row, c_beg_ins, " ") + end_ins = isnothing(c_end_ins) ? " " : _get(row, c_end_ins, " ") + + initial = PDBDetails.UniqueResidueID( + strip(row[c_beg_comp]), + strip(row[c_beg_asym]), + parse(Int, strip(row[c_beg_seq])), + beg_ins + ) + terminal = PDBDetails.UniqueResidueID( + strip(row[c_end_comp]), + strip(row[c_end_asym]), + parse(Int, strip(row[c_end_seq])), + end_ins + ) + + helix_class = isnothing(c_helix_class) ? 1 : (tryparse(Int, _get(row, c_helix_class, "1")) === nothing ? 1 : parse(Int, _get(row, c_helix_class, "1"))) + details = isnothing(c_details) ? "" : _get(row, c_details, "") + + push!(records, PDBDetails.HelixRecord(n, helix_name, initial, terminal, helix_class, details)) + end +end + +function _parse_sheets!(records, block::CIFDataBlock) + loop = _find_loop(block, "_struct_sheet_range.") + isnothing(loop) && return + + cols = _col_map(loop) + + c_sheet_id = get(cols, "_struct_sheet_range.sheet_id", nothing) + c_range_id = get(cols, "_struct_sheet_range.id", nothing) + if isnothing(c_sheet_id) || isnothing(c_range_id) + @warn "mmCIF _struct_sheet_range loop is missing sheet_id or id columns; skipping sheet parsing" + return + end + + # Use auth fields when available, fall back to label + c_beg_comp = _reqcol(cols, "_struct_sheet_range.beg_auth_comp_id", "_struct_sheet_range.beg_label_comp_id") + c_beg_asym = _reqcol(cols, "_struct_sheet_range.beg_auth_asym_id", "_struct_sheet_range.beg_label_asym_id") + c_beg_seq = _reqcol(cols, "_struct_sheet_range.beg_auth_seq_id", "_struct_sheet_range.beg_label_seq_id") + c_end_comp = _reqcol(cols, "_struct_sheet_range.end_auth_comp_id", "_struct_sheet_range.end_label_comp_id") + c_end_asym = _reqcol(cols, "_struct_sheet_range.end_auth_asym_id", "_struct_sheet_range.end_label_asym_id") + c_end_seq = _reqcol(cols, "_struct_sheet_range.end_auth_seq_id", "_struct_sheet_range.end_label_seq_id") + + # All residue columns are required + if any(isnothing, (c_beg_comp, c_beg_asym, c_beg_seq, c_end_comp, c_end_asym, c_end_seq)) + @warn "mmCIF _struct_sheet_range loop is missing required residue columns; skipping sheet parsing" + return + end + + # If any required begin/end columns are missing, skip sheet parsing to avoid + # out-of-bounds access on row[...] (since 0 is not a valid index in Julia). + if c_beg_comp == 0 || c_beg_asym == 0 || c_beg_seq == 0 || + c_end_comp == 0 || c_end_asym == 0 || c_end_seq == 0 + return + end + + c_beg_ins = _optcol(cols, "_struct_sheet_range.pdbx_beg_PDB_ins_code") + c_end_ins = _optcol(cols, "_struct_sheet_range.pdbx_end_PDB_ins_code") + + # Try to get sense from _struct_sheet_order + sense_map = _parse_sheet_order_sense(block) + + for row in loop.rows + sheet_id = strip(row[c_sheet_id]) + range_id = parse(Int, strip(row[c_range_id])) + + beg_ins = isnothing(c_beg_ins) ? " " : _get(row, c_beg_ins, " ") + end_ins = isnothing(c_end_ins) ? " " : _get(row, c_end_ins, " ") + + initial = PDBDetails.UniqueResidueID( + strip(row[c_beg_comp]), + strip(row[c_beg_asym]), + parse(Int, strip(row[c_beg_seq])), + beg_ins + ) + terminal = PDBDetails.UniqueResidueID( + strip(row[c_end_comp]), + strip(row[c_end_asym]), + parse(Int, strip(row[c_end_seq])), + end_ins + ) + + sense = get(sense_map, (sheet_id, range_id), 0) + + push!(records, PDBDetails.SheetRecord(range_id, sheet_id, initial, terminal, sense)) + end +end + +"""Parse _struct_sheet_order to get sense values for each sheet range.""" +function _parse_sheet_order_sense(block::CIFDataBlock) + sense_map = Dict{Tuple{String, Int}, Int}() + loop = _find_loop(block, "_struct_sheet_order.") + isnothing(loop) && return sense_map + + cols = _col_map(loop) + c_sheet = cols["_struct_sheet_order.sheet_id"] + c_range2 = cols["_struct_sheet_order.range_id_2"] + c_sense = _optcol(cols, "_struct_sheet_order.sense") + + isnothing(c_sense) && return sense_map + + for row in loop.rows + sheet_id = strip(row[c_sheet]) + range_id = parse(Int, strip(row[c_range2])) + sense_str = _get(row, c_sense, "parallel") + sense = sense_str == "anti-parallel" ? -1 : (sense_str == "parallel" ? 1 : 0) + sense_map[(sheet_id, range_id)] = sense + end + + return sense_map +end diff --git a/src/fileformats/mmcif/mmcif_writer.jl b/src/fileformats/mmcif/mmcif_writer.jl new file mode 100644 index 00000000..91c92cd5 --- /dev/null +++ b/src/fileformats/mmcif/mmcif_writer.jl @@ -0,0 +1,253 @@ +using Printf + +using BiochemicalAlgorithms + +""" + write_mmcif_impl(io::IO, ac::AbstractAtomContainer{T}) + +Write an atom container as PDBx/mmCIF format to the given IO stream. +""" +function write_mmcif_impl(io::IO, ac::AbstractAtomContainer{T}) where T + name = replace(ac.name, r"\s+" => "_") + isempty(name) && (name = "unnamed") + + println(io, "data_", name) + println(io, "#") + + _write_atom_site(io, ac) + _write_struct_conn(io, ac) + _write_struct_conf(io, ac) + _write_struct_sheet_range(io, ac) +end + +# ─── CIF value quoting ─────────────────────────────────────────────── + +"""Quote a string value for CIF output.""" +function _cif_quote(s::AbstractString) + s = string(s) + isempty(s) && return "." + # Multiline values must use semicolon text blocks + if occursin('\n', s) || occursin('\r', s) + return ";\n$s\n;" + end + # No quoting needed for simple values + if !any(c -> isspace(c), s) && !startswith(s, '_') && !startswith(s, '#') && + !startswith(s, '\'') && !startswith(s, '"') && s != "." && s != "?" + return s + end + # Use single quotes if possible + if !occursin('\'', s) + return "'$s'" + end + # Use double quotes + if !occursin('"', s) + return "\"$s\"" + end + # Fall back to semicolon text block + return ";\n$s\n;" +end + +# ─── _atom_site ─────────────────────────────────────────────────────── + +function _write_atom_site(io::IO, ac::AbstractAtomContainer{T}) where T + all_atoms = atoms(ac) + isempty(all_atoms) && return + + println(io, "loop_") + println(io, "_atom_site.group_PDB") + println(io, "_atom_site.id") + println(io, "_atom_site.type_symbol") + println(io, "_atom_site.label_atom_id") + println(io, "_atom_site.label_alt_id") + println(io, "_atom_site.label_comp_id") + println(io, "_atom_site.label_asym_id") + println(io, "_atom_site.label_entity_id") + println(io, "_atom_site.label_seq_id") + println(io, "_atom_site.pdbx_PDB_ins_code") + println(io, "_atom_site.Cartn_x") + println(io, "_atom_site.Cartn_y") + println(io, "_atom_site.Cartn_z") + println(io, "_atom_site.occupancy") + println(io, "_atom_site.B_iso_or_equiv") + println(io, "_atom_site.pdbx_formal_charge") + println(io, "_atom_site.auth_seq_id") + println(io, "_atom_site.auth_comp_id") + println(io, "_atom_site.auth_asym_id") + println(io, "_atom_site.auth_atom_id") + println(io, "_atom_site.pdbx_PDB_model_num") + + # Build entity ID map (one entity per unique chain name) + entity_map = Dict{String, Int}() + entity_count = 0 + for c in chains(ac) + if !haskey(entity_map, c.name) + entity_count += 1 + entity_map[c.name] = entity_count + end + end + + for a in all_atoms + frag = parent_fragment(a) + chain = isnothing(frag) ? nothing : parent_chain(frag) + + group = is_hetero_atom(a) ? "HETATM" : "ATOM" + type_sym = string(a.element) + atom_name = _cif_quote(a.name) + alt_id = "." + + comp_id = isnothing(frag) ? "UNK" : frag.name + chain_id = isnothing(chain) ? "." : chain.name + entity_id = isnothing(chain) ? 1 : get(entity_map, chain.name, 1) + seq_id = isnothing(frag) ? 0 : frag.number + ins_code = isnothing(frag) ? "?" : _cif_quote(get_property(frag, :insertion_code, "?")) + + x = @sprintf("%.3f", a.r[1]) + y = @sprintf("%.3f", a.r[2]) + z = @sprintf("%.3f", a.r[3]) + + occ = @sprintf("%.2f", get_property(a, :occupancy, 1.0)) + bfac = @sprintf("%.2f", get_property(a, :tempfactor, 0.0)) + + charge = if a.formal_charge == 0 + "?" + elseif a.formal_charge > 0 + "$(a.formal_charge)+" + else + "$(-a.formal_charge)-" + end + + model_num = a.frame_id + + println(io, "$group $( a.number) $type_sym $atom_name $alt_id $comp_id $chain_id $entity_id $seq_id $ins_code $x $y $z $occ $bfac $charge $seq_id $comp_id $chain_id $atom_name $model_num") + end + + println(io, "#") +end + +# ─── _struct_conn (disulfide bonds) ────────────────────────────────── + +function _write_struct_conn(io::IO, ac::AbstractAtomContainer{T}) where T + # Find disulfide bonds + disulfides = filter(b -> has_flag(b, :TYPE__DISULPHIDE_BOND), bonds(ac)) + isempty(disulfides) && return + + println(io, "loop_") + println(io, "_struct_conn.id") + println(io, "_struct_conn.conn_type_id") + println(io, "_struct_conn.ptnr1_label_asym_id") + println(io, "_struct_conn.ptnr1_label_comp_id") + println(io, "_struct_conn.ptnr1_label_seq_id") + println(io, "_struct_conn.ptnr1_label_atom_id") + println(io, "_struct_conn.ptnr1_symmetry") + println(io, "_struct_conn.ptnr2_label_asym_id") + println(io, "_struct_conn.ptnr2_label_comp_id") + println(io, "_struct_conn.ptnr2_label_seq_id") + println(io, "_struct_conn.ptnr2_label_atom_id") + println(io, "_struct_conn.ptnr2_symmetry") + println(io, "_struct_conn.pdbx_dist_value") + + sys = parent_system(ac) + + for (i, b) in enumerate(disulfides) + a1 = atom_by_idx(sys, b.a1) + a2 = atom_by_idx(sys, b.a2) + f1 = parent_fragment(a1) + f2 = parent_fragment(a2) + c1 = parent_chain(a1) + c2 = parent_chain(a2) + + sym1 = get_property(b, :SYMMETRY_OPERATOR_0, 0) + sym2 = get_property(b, :SYMMETRY_OPERATOR_1, 0) + dist = get_property(b, :BOND_LENGTH, 0.0) + + sym1_str = sym1 == 0 ? "?" : string(sym1) + sym2_str = sym2 == 0 ? "?" : string(sym2) + dist_str = dist == 0.0 ? "?" : @sprintf("%.3f", dist) + + println(io, "disulf$(i) disulf $(c1.name) $(f1.name) $(f1.number) $(a1.name) $(sym1_str) $(c2.name) $(f2.name) $(f2.number) $(a2.name) $(sym2_str) $(dist_str)") + end + + println(io, "#") +end + +# ─── _struct_conf (helices) ────────────────────────────────────────── + +function _write_struct_conf(io::IO, ac::AbstractAtomContainer{T}) where T + helices = filter(ss -> ss.element == SecondaryStructureElement.Helix, secondary_structures(ac)) + isempty(helices) && return + + println(io, "loop_") + println(io, "_struct_conf.conf_type_id") + println(io, "_struct_conf.id") + println(io, "_struct_conf.pdbx_PDB_helix_id") + println(io, "_struct_conf.beg_label_comp_id") + println(io, "_struct_conf.beg_label_asym_id") + println(io, "_struct_conf.beg_label_seq_id") + println(io, "_struct_conf.pdbx_beg_PDB_ins_code") + println(io, "_struct_conf.end_label_comp_id") + println(io, "_struct_conf.end_label_asym_id") + println(io, "_struct_conf.end_label_seq_id") + println(io, "_struct_conf.pdbx_end_PDB_ins_code") + println(io, "_struct_conf.pdbx_PDB_helix_class") + println(io, "_struct_conf.details") + println(io, "_struct_conf.pdbx_PDB_helix_length") + + for ss in helices + frags = fragments(ss) + isempty(frags) && continue + + first_frag = first(frags) + last_frag = last(frags) + chain = parent_chain(ss) + + helix_class = get_property(ss, :HELIX_CLASS, 1) + details = _cif_quote(get_property(ss, :COMMENT, "?")) + helix_length = length(frags) + + beg_ins = get_property(first_frag, :insertion_code, "?") + end_ins = get_property(last_frag, :insertion_code, "?") + + println(io, "HELX_P HELX_P$(ss.number) $(ss.name) $(first_frag.name) $(chain.name) $(first_frag.number) $(_cif_quote(beg_ins)) $(last_frag.name) $(chain.name) $(last_frag.number) $(_cif_quote(end_ins)) $(helix_class) $(details) $(helix_length)") + end + + println(io, "#") +end + +# ─── _struct_sheet_range (sheets) ──────────────────────────────────── + +function _write_struct_sheet_range(io::IO, ac::AbstractAtomContainer{T}) where T + strands = filter(ss -> ss.element == SecondaryStructureElement.Strand, secondary_structures(ac)) + isempty(strands) && return + + println(io, "loop_") + println(io, "_struct_sheet_range.sheet_id") + println(io, "_struct_sheet_range.id") + println(io, "_struct_sheet_range.beg_label_comp_id") + println(io, "_struct_sheet_range.beg_label_asym_id") + println(io, "_struct_sheet_range.beg_label_seq_id") + println(io, "_struct_sheet_range.pdbx_beg_PDB_ins_code") + println(io, "_struct_sheet_range.end_label_comp_id") + println(io, "_struct_sheet_range.end_label_asym_id") + println(io, "_struct_sheet_range.end_label_seq_id") + println(io, "_struct_sheet_range.pdbx_end_PDB_ins_code") + + for ss in strands + frags = fragments(ss) + isempty(frags) && continue + + first_frag = first(frags) + last_frag = last(frags) + chain = parent_chain(ss) + + # Extract sheet name from "SheetName:RangeId" format used by PDB reader + name_parts = split(ss.name, ":") + sheet_id = length(name_parts) >= 1 ? name_parts[1] : "S1" + + beg_ins = get_property(first_frag, :insertion_code, "?") + end_ins = get_property(last_frag, :insertion_code, "?") + + println(io, "$(sheet_id) $(ss.number) $(first_frag.name) $(chain.name) $(first_frag.number) $(_cif_quote(beg_ins)) $(last_frag.name) $(chain.name) $(last_frag.number) $(_cif_quote(end_ins))") + end + + println(io, "#") +end diff --git a/src/fileformats/pdb.jl b/src/fileformats/pdb.jl index e985b010..b9f358fb 100644 --- a/src/fileformats/pdb.jl +++ b/src/fileformats/pdb.jl @@ -5,21 +5,6 @@ export write_mmcif, write_pdb -using BioStructures: - read, - writemmcif, - writepdb, - collectatoms, - collectchains, - collectresidues, - PDBFormat, - MMCIFFormat, - MolecularStructure, - Model, - unsafe_addatomtomodel!, - AtomRecord, - fixlists! - using Printf function is_hetero_atom(a::Atom) @@ -27,7 +12,7 @@ function is_hetero_atom(a::Atom) has_flag(a, :is_hetero_atom) || isnothing(f) || !is_amino_acid(f) end -function parse_element_string(es::String) +function parse_element_string(es::AbstractString) result = Elements.Unknown # handle special cases @@ -75,118 +60,6 @@ function extract_element(pdb_element::String, atom_name::String) return element end -function Base.convert(::Type{System{T}}, orig_pdb::MolecularStructure) where T - orig_df = DataFrame(collectatoms(orig_pdb)) - - # then, convert to our representation - sys = System{T}(orig_pdb.name) - mol = Molecule(sys; name = sys.name) - - ### convert the atom positions - r = Vector3{T}.(T.(orig_df.x), T.(orig_df.y), T.(orig_df.z)) - - ### extracting the elements is a little more complicated than it could be, - ### as the DataFrame-conversion strips whitespaces from atom names - elements = extract_element.(orig_df.element, getproperty.(collectatoms(orig_pdb, expand_disordered=true), :name)) - - atoms = DataFrame( - number=orig_df.serial, - name=orig_df.atomname, - element=elements, - r = r, - formal_charge = orig_df.charge - ) - - # convert other columns of interest to atom properties - atoms.properties = Properties.( - collect( - zip( - Pair.(:tempfactor, orig_df.tempfactor), - Pair.(:occupancy, orig_df.occupancy), - Pair.(:is_hetero_atom, orig_df.ishetero), - Pair.(:insertion_code, orig_df.inscode), - Pair.(:alternate_location_id, orig_df.altlocid) - ) - ) - ) - - atoms.flags = map( - h -> h ? Flags([:is_hetero_atom]) : Flags(), - orig_df.ishetero - ) .∪ map( - d -> d ? Flags([:is_deuterium]) : Flags(), - orig_df.element .== "D" - ) - - atoms.frame_id = orig_df.modelnumber - atoms.chain_idx = orig_df.chainid - atoms.fragment_idx = orig_df.resnumber - atoms.inscode = orig_df.inscode - - # note: we will remove this column as soon as we have filtered out alternates - atoms.altlocid = orig_df.altlocid - - # collect fragment information - for orig_chain in collectchains(orig_pdb) - chain = Chain(mol; name = orig_chain.id) - for orig_frag in collectresidues(orig_chain) - Fragment(chain, orig_frag.number; - name = orig_frag.name, - properties = Properties([ - :is_hetero_fragment => orig_frag.het_res, - :insertion_code => orig_frag.ins_code - ]), - variant = _fragment_variant(strip(orig_frag.name)) - ) - end - end - - # now, handle alternate location ids - - # we try to be as tolerant as possible, as the PDB file format does not - # seem to formalize many restrictions here. - # general idea: - # - for each atom that has alternative locations, find them - # - find the smallest alternate location id and use this as the base case - # - store all other variants as properties - all_altlocs = groupby(filter(:altlocid => !=(' '), atoms, view=true), [:chain_idx, :fragment_idx, :name]) - for altlocs in all_altlocs - sorted_altlocs = sort(altlocs, :altlocid, view=true) - - base_case = sorted_altlocs[1, :] - base_case.altlocid = ' ' - - if nrow(sorted_altlocs) > 1 - for altloc in eachrow(sorted_altlocs[2:end, :]) - atoms.properties[base_case.number][Symbol("alternate_location_$(altloc.altlocid)")] = altloc - end - end - end - - # drop all alternates - atoms = filter(:altlocid => ==(' '), atoms) - - # add all remaining atoms to the system - grp_atoms = groupby(atoms, [:chain_idx, :fragment_idx, :inscode]) - for frag in fragments(mol) - for atom in eachrow(grp_atoms[( - chain_idx = parent_chain(frag).name, - fragment_idx = frag.number, - inscode = frag.properties[:insertion_code] - )]) - Atom(frag, atom.number, atom.element; - name = atom.name, - r = atom.r, - properties = atom.properties, - flags = atom.flags, - frame_id = atom.frame_id - ) - end - end - - sys -end - """ load_pdb(io::IO, ::Type{T} = Float32) -> System{T} @@ -255,63 +128,9 @@ Read a PDBx/mmCIF file. !!! note Models are stored as frames, using the model number as `frame_id`. """ -function load_mmcif(fname_io::Union{AbstractString, IO}, ::Type{T} = Float32) where {T <: Real} - # TODO: how to handle disordered atoms properly? - - # first, read the structure using BioStructures.jl - orig_mmcif = read(fname_io, MMCIFFormat) - convert(System{T}, orig_mmcif) -end - -function _to_atom_record(a::Atom) - # TODO: handle alternative location identifiers! - f = parent_fragment(a) - - AtomRecord( - is_hetero_atom(a), - a.number, - @sprintf("%4s", a.name), - ' ', - f.name, - parent_chain(a).name, - f.number, - only(get_property(a, :insertion_code, ' ')), - Vector{Float64}(a.r), - get_property(a, :occupancy, 1.0), - get_property(a, :tempfactor, 0.0), - string(a.element), - string(a.formal_charge) - ) -end - -function Base.convert(::Type{MolecularStructure}, ac::AbstractAtomContainer{T}) where T - # Build a MolecularStructure and add to it incrementally - struc = MolecularStructure(ac.name) - - # figure out if all molecules in the atom container have the same frames - sys_frame_ids = frame_ids(ac) - - if ac isa System{T} - for m in molecules(ac) - if frame_ids(m) != sys_frame_ids - error("pdb.jl: cannot convert System containing molecules with different numbers of frames") - end - end - end - - for (i, frame_id) in enumerate(sys_frame_ids) - struc[i] = Model(i, struc) - - for a in atoms(ac; frame_id=frame_id) - unsafe_addatomtomodel!( - struc[i], - _to_atom_record(a) - ) - end - end - - fixlists!(struc) - struc +function load_mmcif(fname_io::Union{AbstractString, IO}, ::Type{T} = Float32; + create_coils::Bool = true) where {T <: Real} + MMCIFDetails.read_mmcif(fname_io, T; create_coils=create_coils) end function write_pdb(io::IO, ac::Union{Chain{T}, Fragment{T}}) where T @@ -359,7 +178,10 @@ end Save an atom container as PDBx/mmCIF file. """ -function write_mmcif(fname_io::Union{AbstractString, IO}, ac::AbstractAtomContainer) - ps = convert(MolecularStructure, ac) - writemmcif(fname_io, ps) +function write_mmcif(io::IO, ac::AbstractAtomContainer) + MMCIFDetails.write_mmcif_impl(io, ac) +end + +function write_mmcif(fname::AbstractString, ac::AbstractAtomContainer) + open(io -> write_mmcif(io, ac), fname, "w") end diff --git a/src/fileformats/pdb/pdb_general.jl b/src/fileformats/pdb/pdb_general.jl index 32e99d57..b6e0b678 100644 --- a/src/fileformats/pdb/pdb_general.jl +++ b/src/fileformats/pdb/pdb_general.jl @@ -546,8 +546,12 @@ function interpret_record(record_type, tag, record_data...; sys, pdb_info, kwarg push!(pdb_info.records, PDBRecord(tag, record_data)) end -function postprocess_ssbonds_!(sys, pdb_info, fragment_cache) - for ssbond in pdb_info.ssbonds +function postprocess_ssbonds_!(sys, pdb_info::PDBInfo, fragment_cache) + postprocess_ssbonds_!(sys, pdb_info.ssbonds, fragment_cache) +end + +function postprocess_ssbonds_!(sys, ssbonds, fragment_cache) + for ssbond in ssbonds f1 = fragment_cache[ssbond.first] f2 = fragment_cache[ssbond.second] @@ -584,8 +588,12 @@ function postprocess_ssbonds_!(sys, pdb_info, fragment_cache) end end -function postprocess_secondary_structures_!(sys, pdb_info, fragment_cache, create_coils) - for ss in pdb_info.secondary_structures +function postprocess_secondary_structures_!(sys, pdb_info::PDBInfo, fragment_cache, create_coils) + postprocess_secondary_structures_!(sys, pdb_info.secondary_structures, fragment_cache, create_coils) +end + +function postprocess_secondary_structures_!(sys, ss_records, fragment_cache, create_coils) + for ss in ss_records initial_res = get(fragment_cache, ss.initial_residue, nothing) terminal_res = get(fragment_cache, ss.terminal_residue, nothing) diff --git a/test/fileformats/test_pdb.jl b/test/fileformats/test_pdb.jl index 8b3c9523..e5feecd7 100644 --- a/test/fileformats/test_pdb.jl +++ b/test/fileformats/test_pdb.jl @@ -1,11 +1,4 @@ @testitem "Read PDB" begin - using BioStructures: - PDBFormat, - countatoms, - countchains, - countresidues, - read - for T in [Float32, Float64] sys = load_pdb(ball_data_path("../test/data/bpti.pdb"), T) @test sys isa System{T} @@ -181,28 +174,47 @@ end for T in [Float32, Float64] sys = load_mmcif(ball_data_path("../test/data/5pti.cif"), T) @test sys isa System{T} - @test sys.name == "5pti.cif" + @test sys.name == "5pti" @test natoms(sys) == 1087 - @test nbonds(sys) == 0 + @test nbonds(sys) == 3 # 3 disulfide bonds from _struct_conn @test nmolecules(sys) == 1 @test nchains(sys) == 1 @test nfragments(sys) == 123 @test nnucleotides(sys) == 0 @test nresidues(sys) == 58 - sys = open(io -> load_mmcif(io, T), ball_data_path("../test/data/5pti.cif")) - @test sys isa System{T} - @test_broken sys.name == "5pti.cif" # BioStructures does not set a name here... - @test natoms(sys) == 1087 - @test nbonds(sys) == 0 - @test nmolecules(sys) == 1 - @test nchains(sys) == 1 - @test nfragments(sys) == 123 - @test nnucleotides(sys) == 0 - @test nresidues(sys) == 58 + # verify disulfide bonds have correct flags + for b in bonds(sys) + @test has_flag(b, :TYPE__DISULPHIDE_BOND) + end + + # verify coordinates of first atom (ARG-1 N) + a1 = first(atoms(sys)) + @test a1.name == "N" + @test a1.element == Elements.N + @test isapprox(a1.r, Vector3{T}(32.231, 15.281, -13.143); atol=T(0.001)) + + # secondary structures (2 helices + 3 sheet ranges + coils) + @test nsecondary_structures(sys) > 0 + + # IO loading + sys2 = open(io -> load_mmcif(io, T), ball_data_path("../test/data/5pti.cif")) + @test sys2 isa System{T} + @test sys2.name == "5PTI" # from data block name when reading from IO + @test natoms(sys2) == 1087 + @test nbonds(sys2) == 3 + @test nmolecules(sys2) == 1 + @test nchains(sys2) == 1 + @test nfragments(sys2) == 123 + @test nnucleotides(sys2) == 0 + @test nresidues(sys2) == 58 end end +@testitem "Read PDBx/mmCIF nonexistent" begin + @test_throws SystemError load_mmcif("nonexistent_file.cif") +end + @testitem "Write PDBx/mmCIF" begin for T in [Float32, Float64] sys = System{T}() @@ -224,6 +236,11 @@ end @test first(chains(sys2)).name == "A" @test nmolecules(sys2) == 1 + # coordinate round-trip + for (a1, a2) in zip(atoms(sys), atoms(sys2)) + @test isapprox(a1.r, a2.r; atol=T(0.001)) + end + open(io -> write_mmcif(io, sys), fname, "w") sys2 = load_mmcif(fname, T) @test sys2 isa System{T}