Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
SD_networks/modules_SDs.jl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1035 lines (873 sloc)
53.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module SDmoduleInit | |
using IntervalTrees, DataFrames | |
export Initialize_SD_object, Initialize_pyr_object, Interval_object, gap_value, Itree_from_DF | |
gap_value = 5; | |
mutable struct Interval_object | |
int_DF::DataFrame | |
int_tree::Dict{Int64, IntervalTrees.IntervalBTree} | |
end | |
function Read_SDs_file_to_DF(filename::String, comp::Int = 1) | |
SD_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], | |
chr2 = Int64[], coor_s2 = Int64[], coor_e2 = Int64[], identity=Float64[]) | |
open(filename) do sdfile | |
lines = readlines(sdfile); | |
if comp == 1 | |
#coors_str = map(x -> strip(x) |> split |> x -> (length(x) >= 26 ? x[[1,2,3,7,8,9,26]] : nothing), lines) | |
coors_str = map(x -> strip(x) |> split |> x -> x[[1,2,3,7,8,9,26]], lines) | |
elseif comp == 2 | |
#coors_str = map(x -> strip(x) |> split |> x -> ((length(x) >= 26) && !(startswith(x[1], '#')) ? x[[1,2,3,4,5,6,22]] : nothing), lines) | |
coors_str = map(x -> strip(x) |> split |> x -> x[[1,2,3,4,5,6,22]], lines) | |
end | |
map(x -> push!(SD_df ,[parse(Int64, replace(x[1], r"chr" => "")) [parse.(Int64, x[2:3])...]' parse(Int64, replace(x[4], r"chr" => "")) [parse.(Int64, x[5:6])...]' parse(Float64, x[7])]), coors_str) | |
end | |
#if comp == 2 | |
# SD_df = filter_SEDEF_DF!(SD_df) | |
#end | |
return SD_df | |
end | |
function Filter_SDs_dataframe(SD_df::DataFrame, comp::String = "left") | |
SD_df_filt = similar(SD_df, 0) | |
for line in eachrow(SD_df) | |
chr1 = line[:chr] | |
chr2 = line[:chr2] | |
if comp == "left" | |
if chr1 < chr2 | |
push!(SD_df_filt, line) | |
elseif (chr1 == chr2) & (line[:coor_s] < line[:coor_s2]) | |
push!(SD_df_filt, line) | |
end | |
elseif comp == "both" | |
push!(SD_df_filt, line) | |
end | |
end | |
SD_df_filt[!, :ids] = map(x -> "id"*string(x), 1:size(SD_df_filt)[1]) | |
return SD_df_filt | |
end | |
function Itree_from_DF(DF::DataFrame, comp::String = "SD_IntTree") | |
Itree::Dict{Int64, IntervalTrees.IntervalBTree} = Dict() | |
if comp == "SD_IntTree" | |
for line in eachrow(DF) | |
Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, String}}()) | |
push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:ids])) | |
Itree[line[:chr2]] = get(Itree, line[:chr2], IntervalTree{Int, IntervalValue{Int, String}}()) | |
push!(Itree[line[:chr2]], IntervalValue(line[:coor_s2], line[:coor_e2], line[:ids])) | |
end | |
elseif comp == "Pyrs_IntTree" | |
for line in eachrow(DF) | |
Itree = Itree_merge_overlaps(Itree, line[:chr], line[:coor_s], line[:coor_e], gap_value) | |
Itree = Itree_merge_overlaps(Itree, line[:chr2], line[:coor_s2], line[:coor_e2], gap_value) | |
end | |
end | |
return Itree | |
end | |
function Check_sharp_borders_to_ids(SD_df::DataFrame, SD_tree::Dict, comp::String, gap::Int64, gap2::Int64) | |
ind_arr = ones(size(SD_df)[1]) | |
for (i, line) in enumerate(eachrow(SD_df)) | |
if comp == "first" | |
chr, coor_s, coor_e = line[:chr], line[:coor_s], line[:coor_e] | |
elseif comp == "second" | |
chr, coor_s, coor_e = line[:chr2], line[:coor_s2], line[:coor_e2] | |
end | |
over = collect(intersect(SD_tree[chr], (coor_s - gap, coor_e + gap))) | |
for inter in over | |
if ((inter.first - gap <= coor_s) && (coor_s <= inter.first + gap)) || ((inter.last - gap <= coor_e) && (coor_e <= inter.last + gap)) | |
if coor_e - coor_s + gap2 < inter.last - inter.first # don't have to check the id with gap3 | |
ind_arr[i] = 0 | |
end | |
end | |
if ((inter.last - gap <= coor_s) && (coor_s <= inter.last + gap)) || ((inter.first - gap <= coor_e) && (coor_e <= inter.first + gap)) | |
ind_arr[i] = 1 | |
break | |
#fd = 4 | |
end | |
end | |
end | |
return ind_arr | |
end | |
function Filt_sharp_borders_SDs(SD_obj::Interval_object, gap::Int64=15, gap2::Int64=100) | |
SD_df, SD_tree = SD_obj.int_DF, SD_obj.int_tree | |
ind_arr = Check_sharp_borders_to_ids(SD_df, SD_tree, "first", gap, gap2) | |
ind_arr2 = Check_sharp_borders_to_ids(SD_df, SD_tree, "second", gap, gap2) | |
SD_df2 = SD_df[(ind_arr .== 1) .| (ind_arr2 .== 1), :] # in question & | |
SD_tree2 = Itree_from_DF(SD_df2, "SD_IntTree") | |
return Interval_object(SD_df2, SD_tree2), ind_arr, ind_arr2 | |
end | |
function Itree_merge_overlaps(Itree::Dict, chr::Int64, coor_s::Int64, coor_e::Int64, gap::Int64 = gap_value) | |
Itree[chr] = get(Itree, chr, IntervalTree{Int, IntervalValue{Int, String}}()) | |
over = collect(intersect(Itree[chr], (coor_s - gap, coor_e + gap))) | |
if length(over) == 0 | |
push!(Itree[chr], IntervalValue(coor_s, coor_e, "")) | |
else | |
left_b = minimum(push!(map(x -> x.first, over), coor_s)) | |
right_b = maximum(push!(map(x -> x.last, over), coor_e)) | |
map(x -> delete!(Itree[chr], (x.first, x.last)), over) | |
push!(Itree[chr], IntervalValue(left_b, right_b, "")) | |
end | |
return Itree | |
end | |
function DF_from_Itree(Itree::Dict) | |
Pyrs_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], ids = String[]) | |
Itree2::Dict{Int64, IntervalTrees.IntervalBTree} = Dict() | |
id_num = 0 | |
for chr in sort(collect(keys(Itree))) | |
for inter in Itree[chr] | |
id_num += 1 | |
push!(Pyrs_df, [chr inter.first inter.last "id"*string(id_num)]) | |
Itree2[chr] = get(Itree2, chr, IntervalTree{Int, IntervalValue{Int, String}}()) | |
push!(Itree2[chr], IntervalValue(inter.first, inter.last, "id"*string(id_num))) | |
end | |
end | |
return Itree2, Pyrs_df | |
end | |
function filter_SEDEF_DF!(a_df) | |
a_df = a_df[a_df[!, :identity] .>= 0.9, :] | |
inds = min.(a_df[!,:coor_e] - a_df[!,:coor_s], a_df[!,:coor_e2] - a_df[!,:coor_s2]) .>= 999 | |
a_df = a_df[inds, :] | |
return a_df | |
end | |
function Initialize_SD_object(filename::String, comp::Int=1, comp2::String="left"; comp3::String="all") | |
DF_prel = Read_SDs_file_to_DF(filename, comp) | |
DF_filt = Filter_SDs_dataframe(DF_prel, comp2) | |
Itree_SDs = Itree_from_DF(DF_filt, "SD_IntTree") | |
SD_obj = Interval_object(DF_filt, Itree_SDs) | |
if comp3 == "filter" | |
SD_obj = Filt_sharp_borders_SDs(SD_obj) | |
end | |
return SD_obj | |
end | |
function Initialize_pyr_object(DF_filt::DataFrame) | |
Itree_prel = Itree_from_DF(DF_filt, "Pyrs_IntTree") | |
Itree_pyrs, DF_pyrs = DF_from_Itree(Itree_prel) | |
Pyr_obj = Interval_object(DF_pyrs, Itree_pyrs) | |
return Pyr_obj | |
end | |
end | |
##################################################################################################### | |
module NetworkAnalysis | |
using LightGraphs, MetaGraphs, IntervalTrees, DataFrames, Main.SDmoduleInit, GraphPlot, StatsBase | |
using Colors, Compose, Plots, Histograms, GLM, Statistics, HypothesisTests, Distributions, ArndtLabTheme | |
using LaTeXStrings | |
pyplot() | |
theme(:arndtlab) | |
export Graph_pars_object, Initialize_SD_network, Plot_SD_network, Node_degree_Comp_size_distr, Stat_tests_SD_net_features, Spliting_network_diff_ways | |
mutable struct Graph_pars_object | |
mg::MetaGraph | |
self_loops::Dict | |
edges_double::Dict | |
edges_type::Dict | |
edges_tandem::Dict | |
end | |
function Extract_fields_from_Graph_object(graph_obj::Graph_pars_object) | |
out_array = [getfield(graph_obj, field) for field in [:mg, :self_loops, :edges_double, :edges_type, :edges_tandem]] | |
return out_array | |
end | |
function Graph_from_DF(int_obj::Interval_object, SD_df::DataFrame) | |
self_loops::Dict = Dict() | |
edges_double::Dict = Dict() | |
edges_type::Dict = Dict() | |
edges_tandem::Dict = Dict() | |
mg = MetaGraph(size(int_obj.int_DF)[1], 1.0) # 1.0 - default weight | |
map(x -> set_prop!(mg, x, :id, int_obj.int_DF[!, :ids][x]), 1:size(int_obj.int_DF)[1]) | |
set_indexing_prop!(mg, :id) | |
for line in eachrow(SD_df) | |
over1 = collect(intersect(int_obj.int_tree[line[:chr]], (line[:coor_s], line[:coor_e]))) | |
over2 = collect(intersect(int_obj.int_tree[line[:chr2]], (line[:coor_s2], line[:coor_e2]))) | |
if length(over1) != 1 || length(over2) != 1 | |
display("Initialize_graph_from_DF warning") | |
end | |
vert1 = mg[over1[1].value, :id] | |
vert2 = mg[over2[1].value, :id] | |
if has_edge(mg, Edge(vert1, vert2)) | |
edges_double[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_double, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 0) + 1 | |
elseif !has_edge(mg, Edge(vert1, vert2)) && (vert1 != vert2) | |
add_edge!(mg, vert1, vert2) | |
end | |
if vert1 == vert2 | |
self_loops[get_prop(mg, vert1, :id)] = get(self_loops, get_prop(mg, vert1, :id), 0) + 1 | |
else | |
if line[:chr] == line[:chr2] | |
edges_type[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_type, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 1) | |
if min(abs(line[:coor_s] - line[:coor_e2]), abs(line[:coor_s2] - line[:coor_e])) < 5e5 | |
edges_tandem[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_tandem, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 1) | |
end | |
else | |
edges_type[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_type, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 0) | |
end | |
end | |
end | |
graph_obj = Graph_pars_object(mg, self_loops, edges_double, edges_type, edges_tandem) | |
return graph_obj | |
end | |
function Filter_single_nodes_everywhere!(int_obj::Interval_object, graph_obj::Graph_pars_object) | |
mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj) | |
node_single = [e[1] for e in filter(x -> length(x) == 1, connected_components(mg))] | |
id_single = map(x -> get_prop(mg, x, :id), node_single) | |
map(x -> rem_prop!(mg, mg[x, :id], :id), id_single) | |
mg = mg[filter_vertices(mg, :id)] | |
for id in id_single | |
line = int_obj.int_DF[int_obj.int_DF[!, :ids] .== id, :] | |
over = collect(intersect(int_obj.int_tree[line[!, :chr][1]], (line[!, :coor_s][1], line[!, :coor_e][1]))) | |
if length(over) == 1 | |
delete!(int_obj.int_tree[line[!, :chr][1]], (over[1].first, over[1].last)) | |
elseif length(over) > 1 | |
display("Filter_single_nodes_everywhere warning") | |
end | |
int_obj.int_DF = int_obj.int_DF[int_obj.int_DF[!, :ids] .!= id, :] | |
delete!(self_loops, id) | |
end | |
set_indexing_prop!(mg, :id) | |
graph_obj2 = Graph_pars_object(mg, self_loops, edges_double, edges_type, edges_tandem) | |
return graph_obj2 | |
end | |
function Initialize_SD_network(int_obj::Interval_object, SD_df::DataFrame) | |
graph_obj = Graph_from_DF(int_obj, SD_df); | |
graph_obj = Filter_single_nodes_everywhere!(int_obj, graph_obj); | |
mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj); | |
intra_num = reduce(+, [edges_type[i] for i in keys(edges_type)]); | |
inter_num = length(edges_type) - intra_num; | |
display("Double edges = $(length(edges_double)) = $(length(edges_double)/ne(mg)) of all edges") | |
display("Self loops = $(length(self_loops)) = $(length(self_loops)/nv(mg)) of all nodes (or $(404/(nv(mg)+404-259)) without filtering singleton nodes)") | |
display("Tandem edges = $(length(edges_tandem)) = $(length(edges_tandem)/ne(mg)) of all edges and $(length(edges_tandem)/intra_num) of all intra") | |
display("Intra = $intra_num and Inter = $inter_num edges") | |
return graph_obj; | |
end | |
function Plot_SD_network(graph_obj::Graph_pars_object, comp::String = "normal") | |
mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj) | |
if comp == "normal" | |
gp = gplot(mg) | |
draw(PDF("./pics/SD_net.pdf", 120cm, 120cm), gp) | |
elseif comp == "self_loops" | |
verts = map(x -> mg[x, :id], collect(keys(self_loops))) | |
membership = ones(Int, nv(mg)) | |
membership[verts] .+= 1 | |
nodecol = [colorant"lightseagreen", colorant"red"][membership] | |
gp = gplot(mg, nodefillc=nodecol) | |
draw(PDF("./pics/SD_net_self_loops.pdf", 120cm, 120cm), gp) | |
elseif comp == "edges_double" | |
edgs = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), collect(keys(edges_double))) | |
membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg))) | |
edgecol = [colorant"grey", colorant"red"][membership] | |
gp = gplot(mg, edgestrokec=edgecol) | |
draw(PDF("./pics/SD_net_double_edges.pdf", 120cm, 120cm), gp) | |
elseif comp == "edges_tandem" | |
edgs = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), collect(keys(edges_tandem))) | |
membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg))) | |
edgecol = [colorant"grey", colorant"red"][membership] | |
gp = gplot(mg, edgestrokec=edgecol) | |
draw(PDF("./pics/SD_net_tandem_edges.pdf", 120cm, 120cm), gp) elseif comp == "edges_intra" | |
edgs = map(x -> edges_type[x] == 1 ? Edge(mg[x[1], :id], mg[x[2], :id]) : 0, collect(keys(edges_type))) | |
edgs = edgs[edgs .!= 0] | |
membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg))) | |
edgecol = [colorant"grey", colorant"red"][membership] | |
gp = gplot(mg, edgestrokec=edgecol) | |
draw(PDF("./pics/SD_net_intra_edges.pdf", 120cm, 120cm), gp) | |
elseif comp == "both_selfloops_intra" | |
edgs = map(x -> edges_type[x] == 1 ? Edge(mg[x[1], :id], mg[x[2], :id]) : 0, collect(keys(edges_type))) | |
edgs = edgs[edgs .!= 0] | |
membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg))) | |
edgecol = [colorant"grey", colorant"red"][membership] | |
verts = map(x -> mg[x, :id], collect(keys(self_loops))) | |
membership = ones(Int, nv(mg)) | |
membership[verts] .+= 1 | |
nodecol = [colorant"lightseagreen", colorant"red"][membership] | |
gp = gplot(mg, edgestrokec=edgecol, nodefillc=nodecol); | |
draw(PDF("./pics/SD_both_selfloops_intra.pdf", 120cm, 120cm), gp); | |
end | |
end | |
function Node_degree_Comp_size_distr(mgs::Array, legends::Array{String,1}, comp=nothing; cc_bin::Int=20, nd_bin::Int=20, thr::Int=5, max_x::Int64=13000) | |
colors = ArndtLabTheme.arndtlab_palette | |
colors2 = copy(colors) | |
colors2[4], colors2[5] = colors[5], colors[4] | |
colors2[5], colors2[6] = colors[6], colors[4] | |
markers = [:circle, :circle, :circle, :circle, :xcross, :circle, :circle, :circle, :circle] | |
m_sizes = [6, 6, 6, 6, 5, 6, 6, 6, 6] | |
h = Histograms.Histogram([UInt64(30000)], n=cc_bin, scale=:log) | |
cc_p, nd_p, rat_p, rat_m_p = plot(), plot(), plot(), plot() | |
for (i, mg) in enumerate(mgs) | |
ccs = collect(connected_components(mg)) | |
cc_len = map(length, ccs); | |
ucc_len = convert(Array{UInt,1}, cc_len) | |
cc_h = Histograms.Histogram(ucc_len, breaks=h.breaks, scale=:log); | |
plot!(cc_p, cc_h, xscale=:log10, yscale=:log10, markershape=markers[i], color=colors2[i], | |
label=legends[i], line=false, markersize=m_sizes[i], xtickfont=Plots.font(13), | |
ytickfont=Plots.font(13), xlim=[1, max_x], legendfontsize=12); | |
x = range(2, 1800, length=10); y = 1.5*x.^(-2.7); | |
plot!(cc_p, x, y, label="", grid=false, color=colors2[6]) | |
if comp != nothing | |
savefig(comp) | |
end | |
n_deg = map(x -> length(neighbors(mg, x)), collect(vertices(mg))); | |
n_deg = convert.(Float64, n_deg); | |
nd_h = Histograms.Histogram(n_deg, N=nd_bin, scale=:identity); | |
plot!(nd_p, nd_h, xscale=:identity, yscale=:log10, markershape=markers[i], color=colors2[i], legend=false, grid=false, line=false, markersize=m_sizes[i], xtickfont=Plots.font(13), ytickfont=Plots.font(13)); | |
#nd_p = plot(nd_h, yscale=:log10, xscale=:log10, markershape=:auto, legend=false, grid=false, line=false, xlim=[1,170]); | |
x = range(1, 150, length=10); y = 0.01*exp.(-0.056x); | |
plot!(nd_p, x, y, legend=false, grid=false, color=colors2[6]) | |
#savefig("./pics/node_degree_lin_SD.pdf") | |
edgs::Array{Int64,1} = [] | |
for cc in ccs | |
push!(edgs, sum([length(neighbors(mg, node)) for node in cc])/2) | |
end | |
plot!(rat_p, cc_len, edgs, xscale=:log10, yscale=:log10, color=colors2[i], markersize=m_sizes[i], markershape=markers[i], line=false, label="", grid=false, xtickfont=Plots.font(13), ytickfont=Plots.font(13)) | |
b, k, rat_p = Regression_from_2_arrays(cc_len, edgs, ((cc_len .> thr)), rat_p, [1, 2000], comp="log", color=colors2[i]) | |
m_edgs = [mean(edgs[cc_len .== vals]) for vals in sort(unique(cc_len))] | |
m_cc_len = sort(unique(cc_len)) | |
plot!(rat_m_p, m_cc_len, m_edgs, xscale=:log10, yscale=:log10, color=colors2[i], markershape=markers[i], line=false, grid=false, label="", markersize=m_sizes[i], xtickfont=Plots.font(13), ytickfont=Plots.font(13)) | |
xs = range(1, 2000, length=10); | |
ys = (10^b)*xs.^k; | |
plot!(rat_m_p, xs, ys, label="α = $k", color=colors2[i]) | |
#b, k, rat_m_p = Regression_from_2_arrays(m_cc_len, m_edgs, ((m_cc_len .> 5)), rat_m_p, [1, 2000], comp="log", color=colors2[i]) | |
#savefig("./pics/comps_nodes_edges_SD.pdf") | |
end | |
display(cc_p) | |
#savefig(cc_p, "./pics/animals_cc_distr.pdf") | |
display(nd_p) | |
#savefig(nd_p, "./pics/animals_nd_distr.pdf") | |
display(rat_p) | |
gui(rat_m_p) | |
#savefig(rat_m_p, "./pics/animals_m_n_e_distr.pdf") | |
end | |
function Regression_from_2_arrays(arrX::Array, arrY::Array, cond::BitArray{1}, plotobj::Plots.Plot, xlim::Array; comp::String = "linear", color=:red) | |
arrX = arrX[cond]; arrY = arrY[cond] | |
if comp == "linear" | |
data = DataFrame(X=arrX, Y=arrY) | |
elseif comp == "log" | |
data = DataFrame(X=log10.(arrX), Y=log10.(arrY)) | |
end | |
slm = lm(@formula(Y ~ X), data) | |
b, k = coef(slm) | |
x = range(xlim[1], xlim[2], length=10); | |
if comp == "linear" | |
y = k*x .+ b; | |
elseif comp == "log" | |
y = (10^b)*x.^k; | |
end | |
k_round = round(k, digits=2) | |
plot!(plotobj, x, y, label="α = $k_round", color=color) | |
display(k) | |
return b, k_round, plotobj | |
end | |
function Stat_tests_SD_net_features(graph_obj::Graph_pars_object, limits::Array{Int64,1}=[10, 1000]) | |
mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj) | |
big_nodes::Array{String,1} = [] | |
other_nodes::Array{String,1} = [] | |
middle_ccs::Array{Array{Int64,1},1} = [] | |
middle_intra::Array{Int64, 1} = [] | |
middle_lens::Array{Int64, 1} = [] | |
ccs = collect(connected_components(mg)) | |
bc_size = maximum(map(length, ccs)) | |
map(x -> length(x) == bc_size ? append!(big_nodes, Node_array_to_back_ids(mg, x)) : append!(other_nodes, Node_array_to_back_ids(mg, x)), ccs) | |
f_plus, f_minus = 0, 0 | |
map(x -> x in big_nodes ? f_plus += 1 : f_minus += 1, collect(keys(self_loops))) | |
pv_self = pvalue(FisherExactTest(f_plus, f_minus, length(big_nodes), length(other_nodes))) | |
display([f_plus/length(big_nodes) f_minus/length(other_nodes)]) | |
display("Fisher test p-value for self_loops in big/other components = $pv_self") | |
f_plus, f_minus = 0, 0 | |
arr_intra = collect(filter(x -> edges_type[x] == 1, collect(keys(edges_type)))) | |
map(x -> (x[1] in big_nodes) && (x[2] in big_nodes) ? f_plus += 1 : f_minus += 1, arr_intra) | |
ne_big = sum(map(x -> neighbors(mg, mg[x, :id]) |> length, big_nodes))/2 | |
ne_other = ne(mg) - ne_big | |
pv_intra = pvalue(FisherExactTest(f_plus, f_minus, Int(ne_big), Int(ne_other))) | |
display([f_plus/Int(ne_big) f_minus/Int(ne_other)]) | |
display("Fisher test p-value for intra_edges in big/other components = $pv_intra") | |
f_plus, f_minus = 0, 0 | |
map(x -> (x[1] in big_nodes) && (x[2] in big_nodes) ? f_plus += 1 : f_minus += 1, collect(keys(edges_double))) | |
ne_big = sum(map(x -> neighbors(mg, mg[x, :id]) |> length, big_nodes))/2 | |
ne_other = ne(mg) - ne_big | |
pv_double = pvalue(FisherExactTest(f_plus, f_minus, Int(ne_big), Int(ne_other))) | |
display([f_plus/Int(ne_big) f_minus/Int(ne_other)]) | |
display("Fisher test p-value for double_edges in big/other components = $pv_double") | |
middle_ccs = filter(x -> (length(x) >= limits[1]) && (length(x) <= limits[2]) , ccs) | |
for comp in middle_ccs | |
comp_intra = 0 | |
gr = induced_subgraph(mg, comp)[1] | |
set_indexing_prop!(gr, :id) | |
push!(middle_lens, ne(gr)) | |
for edg in edges(gr) | |
edge = (gr[edg.src, :id], gr[edg.dst, :id]) | |
comp_intra += edges_type[edge] | |
end | |
push!(middle_intra, comp_intra) | |
end | |
chi_val = Calculate_Chi_square(middle_lens, middle_intra) | |
simul_chi_vals = Simul_intra_distribution(middle_lens, sum(middle_intra)) | |
#display(middle_intra./middle_lens) | |
num_more = count(x -> x >= chi_val, simul_chi_vals) | |
display("There are $num_more simulations with bigger intra- fraction value than observed in middle size components out of $(length(simul_chi_vals))") | |
end | |
function Node_array_to_back_ids(mg::MetaGraph, node_arr::Array) | |
[mg[i, :id] for i in node_arr] | |
end | |
function Calculate_Chi_square(ns::Array{Int64,1}, ks::Array{Int64,1}) | |
p = sum(ks)/sum(ns) | |
sum(((p*ns .- ks).^2)./(p*ns)) | |
end | |
function Simul_intra_distribution(ns::Array{Int64,1}, num::Int64, iter::Int64=1000) | |
vals::Array{Float64,1} = [] | |
prs = ns/sum(ns) | |
distr = Multinomial(num, prs) | |
for i in 1:iter | |
push!(vals, Calculate_Chi_square(ns, rand(distr))) | |
end | |
return vals | |
end | |
function Spliting_network_diff_ways(graph_obj::Graph_pars_object, comp::String) | |
mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj) | |
cc = connected_components(mg) | |
lens = map(length, cc) | |
big_nodes = Node_array_to_back_ids(mg, cc[lens .== maximum(lens)][1]) | |
if comp == "intra" || comp == "intra_nobig" | |
arr_intra = collect(filter(x -> edges_type[x] == 1, collect(keys(edges_type)))) | |
edges_intra = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), arr_intra) | |
gr = induced_subgraph(mg, edges_intra)[1] | |
set_indexing_prop!(gr, :id) | |
if comp == "intra_nobig" | |
node_arr = filter(ver -> !(gr[ver, :id] in big_nodes), vertices(gr)) | |
gr = induced_subgraph(gr, node_arr)[1] | |
set_indexing_prop!(gr, :id) | |
end | |
return gr | |
elseif comp == "inter" || comp == "inter_nobig" | |
arr_inter = collect(filter(x -> edges_type[x] == 0, collect(keys(edges_type)))) | |
edges_inter = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), arr_inter) | |
gr = induced_subgraph(mg, edges_inter)[1] | |
set_indexing_prop!(gr, :id) | |
if comp == "inter_nobig" | |
node_arr = filter(ver -> !(gr[ver, :id] in big_nodes), vertices(gr)) | |
gr = induced_subgraph(gr, node_arr)[1] | |
set_indexing_prop!(gr, :id) | |
end | |
return gr | |
end | |
end | |
end | |
########################################################################################################### | |
module PyrSDTypes | |
using StatsBase, Statistics, DataFrames, Histograms, Plots, MetaGraphs, LightGraphs, StatsPlots, GLM, Main.SDmoduleInit, Main.NetworkAnalysis, DecisionTree, Random, ArndtLabTheme | |
pyplot() | |
theme(:arndtlab) | |
export Real_SD_length_distr, Pyr_length_node_degree!, Modify_DF_for_RandForset! | |
function Real_SD_length_distr(Itree_SDs::Dict, Itree_pyrs::Dict, sds_df::DataFrame) | |
real_SDs::Array{String, 1} = [] | |
for chr in keys(Itree_SDs) | |
sing_SD = filter(x -> length(collect(intersect(Itree_SDs[chr], x))) == 1, collect(Itree_SDs[chr])) | |
append!(real_SDs, [i.value for i in sing_SD]) | |
end | |
real_SDs_df = sds_df[map(x -> x in real_SDs, sds_df[!, :ids]),:] | |
#real_SDs_df = real_SDs_df[0.98 .< real_SDs_df[!, :identity] .< 0.9999, :] | |
lens::Array{Float64,1} = real_SDs_df[!, :coor_e] - real_SDs_df[!, :coor_s] | |
cc_h = Histograms.Histogram(lens, N=30, scale=:log10); | |
cc_p = plot(cc_h, xscale=:log10, yscale=:log10, markershape=:auto, line=false); | |
x = range(10^3, 10^5.5, length=10); y = 900*x.^(-2); | |
cc_p = plot!(x, y, label="a = -2") | |
display(mean(lens)) | |
display(cc_p) | |
end | |
function Pyr_length_node_degree!(pyrs_df::DataFrame, mg::MetaGraph) | |
node_len::Dict = Dict() | |
pyrs_df[!, :degree] = zeros(size(pyrs_df)[1]) | |
pyrs_df[!, :length] = convert.(Float64, pyrs_df[!, :coor_e] .- pyrs_df[!, :coor_s]) | |
for i in vertices(mg) | |
neis = convert(Float64, length(neighbors(mg, i))) | |
len = pyrs_df[pyrs_df[!, :ids] .== get_prop(mg, i, :id), :length][1] | |
pyrs_df[pyrs_df[!, :ids] .== get_prop(mg, i, :id), :degree] = neis | |
node_len[neis] = push!(get(node_len, neis, Float64[]), len) | |
end | |
nd = sort(collect(keys(node_len))) | |
mlen = map(x -> mean(node_len[x]), nd) | |
p = plot(nd[nd .< 30], mlen[nd .< 30], markershape=:auto, line=false, legend=false, markersize=6, xtickfont=Plots.font(13), xticks=[0,5,10,15,20,25,30], xlim=[0,30], ytickfont=Plots.font(13)) | |
p = plot!(nd[nd .< 30], 3625 .+ 3700*nd[nd .< 30]) | |
display(p) | |
#savefig("./pics/length_nd_SD.pdf") | |
p = plot(nd, mlen, markershape=:auto, line=false) | |
p = plot!(nd[nd .< 30], 3625 .+ 3700*nd[nd .< 30]) | |
display(p) | |
q = plot(log10.(pyrs_df[!, :degree]), log10.(pyrs_df[!, :length]), markershape=:auto, line=false, markersize = 1.0) | |
gui(q) | |
end | |
function Pyr_nd_bins_length(pyrs_df::DataFrame, fr_starts::Array, fr_ends::Array) | |
low_freq_len = pyrs_df[fr_starts[1] .<= pyrs_df[:degree] .<= fr_ends[1] , :length] | |
mid_freq_len = pyrs_df[fr_starts[2] .<= pyrs_df[:degree] .<= fr_ends[2] , :length] | |
high_freq_len = pyrs_df[fr_starts[3] .<= pyrs_df[:degree] .<= fr_ends[3] , :length] | |
#p = histogram(low_freq_len, bins=:scott, weights=repeat(1:5, outer=200)) | |
end | |
function Modify_DF_for_RandForset!(a::Interval_object, b::Interval_object, segs_int::Interval_object, graph_obj::Graph_pars_object, mean_covs::Array{Float64,1}) | |
df = DataFrame(length=Float64[], self=Int64[], double=Float64[], degree=Float64[], comp_s=Float64[], mean_cov=Float64[], breaks = Int64[], max_cov = Int64[], intra_frac = Float64[], max_len = Int64[]) | |
d_e = Dict() | |
breaks = Dict() | |
ls = Int64[] | |
b.int_DF[!, :mean_cov] = mean_covs | |
b.int_DF[!, :comp_size] = zeros(size(b.int_DF)[1]) | |
ccs = connected_components(graph_obj.mg) | |
for com in ccs | |
len = length(com) | |
map(x -> get_prop(graph_obj.mg, x, :id) |> x -> b.int_DF[b.int_DF[!, :ids] .== x, :comp_size] = len, com) | |
end | |
for i in keys(graph_obj.edges_double) | |
d_e[i[1]] = get(d_e, i[1], 0) + graph_obj.edges_double[i] | |
d_e[i[2]] = get(d_e, i[2], 0) + graph_obj.edges_double[i] | |
end | |
for i in keys(b.int_tree) | |
for j in collect(intersect(b.int_tree[i], a.int_tree[i])) | |
breaks[j[1].value] = get(breaks, j[1].value, reshape(Int64[], (0, 2))) | |
breaks[j[1].value] = vcat(breaks[j[1].value], [j[2].first j[2].last]) | |
end | |
end | |
for i in keys(breaks) | |
l = 0 | |
for j in 0:10 | |
l += length(unique(breaks[i][map(x -> x + j - 5 in breaks[i][:,2], breaks[i][:,1]),2])) | |
end | |
push!(ls, l) | |
end | |
b.int_DF[!, :pyr_breaks] = ls | |
for i in eachrow(b.int_DF) | |
deg = i[:degree] | |
len = i[:length] | |
mcov = i[:mean_cov] | |
brks = i[:pyr_breaks] | |
comp_s = i[:comp_size] | |
self = get(graph_obj.self_loops, i[:ids], 0) | |
double = get(d_e, i[:ids], 0) | |
sds = collect(intersect(a.int_tree[i[:chr]], (i[:coor_s], i[:coor_e]))) | |
max_len = maximum(map(x -> x.last - x.first + 1, sds)) | |
intra_frac = mean(map(x -> i[:chr] == a.int_DF[a.int_DF[!, :ids] .== x.value, :chr2][1] ? 1.0 : 0.0, sds)) | |
segs = collect(intersect(segs_int.int_tree[i[:chr]], (i[:coor_s], i[:coor_e]))) | |
max_cov = maximum(map(x -> x.value, segs)) | |
push!(df, [len self double deg comp_s mcov brks max_cov intra_frac max_len]) | |
end | |
b.int_DF[!, :intra_frac] = df[:, :intra_frac] | |
lms = lm(@formula(length ~ self + double + degree + comp_s + mean_cov + breaks + max_cov + intra_frac), df) | |
display(r2(lms)) | |
display(lms) | |
return df | |
end | |
end | |
############################################################################################## | |
module SegmentBlocks | |
using IntervalTrees, Main.SDmoduleInit, DataFrames, StatsBase, Statistics, GenomicFeatures, IntervalSets, Histograms, Plots, ArndtLabTheme | |
pyplot() | |
theme(:arndtlab) | |
export pyramid_coverage_calc, plot_segments_distribution, itree_from_df | |
function find_segments_from_it(int_SDs::Interval_object, int_pyrs::Interval_object) | |
dic_starts, dic_ends = Dict(), Dict() | |
for chr in keys(int_pyrs.int_tree) | |
for it in int_pyrs.int_tree[chr] | |
sds_set = intersect(int_SDs.int_tree[chr], it) | |
for sd in sds_set | |
dic_starts[chr] = get(dic_starts, chr, Int64[]) | |
dic_ends[chr] = get(dic_ends, chr, Int64[]) | |
push!(dic_starts[chr], sd.first) | |
push!(dic_ends[chr], sd.last) | |
end | |
end | |
end | |
out_df = find_segments_second(dic_starts, dic_ends) | |
itree = itree_from_df(out_df) | |
return Interval_object(out_df, itree) | |
end | |
function find_segments_second(dic_starts::Dict, dic_ends::Dict) | |
out_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], length = Int64[], cover = Int64[]) | |
map(x -> sort!(dic_starts[x]), collect(keys(dic_starts))) | |
map(x -> sort!(dic_ends[x]), collect(keys(dic_ends))) | |
for chr in keys(dic_starts) | |
st_ind, en_ind, cover = 1, 1, 0 | |
covers = Int64[] | |
push!(dic_starts[chr], 5e10) | |
push!(dic_ends[chr], 5e10) | |
while en_ind <= length(dic_ends[chr]) - 1 | |
if dic_starts[chr][st_ind] < dic_ends[chr][en_ind] | |
cover += 1 | |
st_ind += 1 | |
elseif dic_starts[chr][st_ind] >= dic_ends[chr][en_ind] | |
cover -= 1 | |
en_ind += 1 | |
end | |
push!(covers, cover) | |
end | |
coors = sort(append!(dic_starts[chr], dic_ends[chr])) | |
for i in 1:(length(coors) - 3) # because added 2 big values; | |
push!(out_df, (chr, coors[i], coors[i + 1], coors[i + 1] - coors[i], covers[i])) | |
end | |
end | |
out_df = out_df[(out_df[!, :length] .!= 0) .& (out_df[!, :cover] .!= 0), :] | |
return out_df | |
end | |
function itree_from_df(cnvs_df::DataFrame, comp::String = "cover") | |
Itree::Dict{Int64, IntervalTrees.IntervalBTree} = Dict() | |
for line in eachrow(cnvs_df) | |
Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, Int}}()) | |
if comp == "cover" | |
push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:cover])) | |
elseif comp == "nothing" | |
push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], 0)) | |
end | |
end | |
return Itree | |
end | |
function pyramid_coverage_calc(int_pyrs::Interval_object, int_sds::Interval_object) | |
mean_covs = Float64[] | |
all_sds::Array{GenomicFeatures.Interval, 1} = [] | |
for line in eachrow(int_pyrs.int_DF) | |
len = line[:length] | |
chr, coor1, coor2 = line[:chr], line[:coor_s], line[:coor_e] | |
sds = collect(intersect(int_sds.int_tree[chr], (coor1, coor2))) | |
sds2 = [IntervalSets.intersect(coor1..coor2, i.first..i.last) for i in sds] | |
sds3 = [(i.left - coor1)/len..(i.right - coor1)/len for i in sds2] | |
sds4 = [GenomicFeatures.Interval("1", 1 + floor(Int, 10000*i.left), 1 + floor(Int, 10000*i.right)) for i in sds3] | |
if length(sds4) > 0 | |
append!(all_sds, sds4) | |
cov_loc = coverage(sort(sds4)) | |
mean_cov = sum(map(x -> (rightposition(x) - leftposition(x) + 1)*metadata(x), cov_loc))/10001 | |
push!(mean_covs, mean_cov) | |
else | |
push!(mean_covs, 0.0) | |
end | |
end | |
covers = coverage(sort(all_sds)) | |
return mean_covs, covers | |
end | |
function plot_segments_distribution(a::Interval_object, b::Interval_object; len_thr::Int64=1000) | |
segs_int = SegmentBlocks.find_segments_from_it(a, b) | |
sf = segs_int.int_DF[segs_int.int_DF[!, :length] .> len_thr, :] | |
cc_h = Histograms.Histogram(collect(sf[!, :cover]), N=40, scale=:log10) | |
cc_p = plot(cc_h, xscale=:log10, yscale=:log10, markershape=:auto, line=false) | |
x = range(1, 100, length=10); y = 2*x.^(-2); | |
cc_p = plot!(x, y) | |
gui(cc_p) | |
display("Only long segments genome fraction:") | |
display(sum(segs_int.int_DF[segs_int.int_DF[!, :length] .> len_thr, :length])/2.88e9) | |
display("All segments genome fraction:") | |
display(sum(segs_int.int_DF[!, :length])/2.88e9) | |
return segs_int | |
end | |
end | |
############################################################################################### | |
module CNVspart | |
using DataFrames, Main.SDmoduleInit, IntervalTrees, IntervalSets, StatsBase, GenomicFeatures, Plots, ArndtLabTheme | |
pyplot() | |
theme(:arndtlab) | |
export read_CNVs_to_DF, shuffle_DF_to_Intobject, overlap_CNVs_pyrs!, CNV_rare_fixed_overlap, find_gene_overlap!, itree_from_df | |
function read_CNVs_to_DF(file_SDs::String = "../CNV_pop_cancer/CNVs_only_popul_hg38.txt", ac_thr::Array{Int64,1} = [1, 2504], samplesize::Int64=5008) | |
cnvs_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], ac = Int64[]) | |
open(file_SDs) do sdfile | |
lines = readlines(sdfile) | |
acs = [] | |
for l in lines | |
a = split(l) | |
cn_vals = [x.match for x in eachmatch(r"<CN[0-9]+>", a[5])] | |
m = match(r"AC=([0-9,]+);", a[8]) | |
ac_vals = parse.(Int64, split(m.captures[1], ",")) | |
ac = sum(ac_vals[cn_vals .!= "<CN0>"]) | |
push!(acs, ac <= samplesize/2 ? ac : samplesize - ac) | |
end | |
#acs = map(x -> strip(x) |> x -> split(x)[8] |> x -> split(x, r",|;")[1] |> x -> split(x, "=")[2] |> x -> parse(Int64, x) |> x -> x <= samplesize/2 ? x : samplesize - x, lines) | |
chrs = map(x -> split(x)[1], lines) | |
coor_s = map(x -> split(x)[2] |> x -> parse(Int64, x), lines) | |
coor_e = map(x -> split(x)[8] |> x -> match(r"END=[0-9]+", x).match |> x -> split(x, "=")[2] |> x -> parse(Int64, x), lines) | |
chrs = map(x -> x in ["X", "Y", "M"] ? 0 : parse(Int64, x), chrs) | |
[push!(cnvs_df, [chrs[i] coor_s[i] coor_e[i] acs[i]]) for i in 1:length(chrs)]; | |
cnvs_df = cnvs_df[(cnvs_df[!, :chr] .!== 0) .& (ac_thr[1] .<= cnvs_df[!, :ac] .<= ac_thr[2]), :] | |
end | |
Itree = itree_from_df(cnvs_df) | |
return Interval_object(cnvs_df, Itree) | |
end | |
function overlap_CNVs_pyrs!(cnv_int::Interval_object, pyrs_int::Interval_object, comp::String = "real"; pnum::Int64 = 1000) | |
if comp == "real" | |
pyrs_int.int_DF[!, :cnv_over] = zeros(Int64, size(pyrs_int.int_DF)[1]) | |
elseif comp == "shuffle" | |
pyrs_int.int_DF[!, :shuff_over] = zeros(Int64, size(pyrs_int.int_DF)[1]) | |
elseif comp == "perm_over" | |
pyrs_int.int_DF[!, :cnv_over] = zeros(Int64, size(pyrs_int.int_DF)[1]) | |
pyrs_int.int_DF[!, :over_len] = zeros(Float64, size(pyrs_int.int_DF)[1]) | |
pyrs_int.int_DF[!, :nor_left] = zeros(Float64, size(pyrs_int.int_DF)[1]) | |
pyrs_int.int_DF[!, :nor_right] = zeros(Float64, size(pyrs_int.int_DF)[1]) | |
perm_overs = reshape(Float64[], (0, pnum)) | |
elseif comp == "shuffle2" | |
pyrs_int.int_DF[!, :shuff_over] = zeros(Int64, size(pyrs_int.int_DF)[1]) | |
pyrs_int.int_DF[!, :nor_left] = zeros(Float64, size(pyrs_int.int_DF)[1]) | |
pyrs_int.int_DF[!, :nor_right] = zeros(Float64, size(pyrs_int.int_DF)[1]) | |
end | |
for k in keys(pyrs_int.int_tree) | |
if k in keys(cnv_int.int_tree) | |
for inter in intersect(pyrs_int.int_tree[k], cnv_int.int_tree[k]) | |
if comp == "real" | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over][1] + 1 | |
elseif comp == "shuffle" | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over][1] + 1 | |
elseif comp == "perm_over" | |
l_over, lens_exp, nor_left, nor_right = calc_overlap_fraction_predict(inter[1].first..inter[1].last, inter[2].first..inter[2].last, pnum) | |
if pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over][1] == 0 | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = nor_left | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = nor_right | |
else | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = 0.0 | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = 1.0 | |
end | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over][1] + 1 | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :over_len] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :over_len][1] + l_over | |
perm_overs = vcat(perm_overs, lens_exp') | |
elseif comp == "shuffle2" | |
l_over, lens_exp, nor_left, nor_right = calc_overlap_fraction_predict(inter[1].first..inter[1].last, inter[2].first..inter[2].last, 1) | |
if pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over][1] == 0 | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = nor_left | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = nor_right | |
else | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = 0.0 | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = 1.0 | |
end | |
pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over][1] + 1 | |
end | |
end | |
end | |
end | |
if comp == "perm_over" | |
sums = [sum(perm_overs[!, i]) for i in 1:size(perm_overs)[2]] | |
pval = countmap(sums .> sum(pyrs_int.int_DF[!, :over_len]))[:true]/pnum | |
display("p-value = $(pval) it means nothing") | |
end | |
if comp == "shuffle2" || comp == "perm_over" | |
only_over = pyrs_int.int_DF[pyrs_int.int_DF[!, :nor_right] - pyrs_int.int_DF[!, :nor_left] .> 0.0, :] | |
ints = [GenomicFeatures.Interval("1", 1 + floor(Int, 10000*i[:, :nor_left]), 1 + floor(Int, 10000*i[:, :nor_right])) for i in eachrow(only_over)] | |
covers = coverage(sort(ints)) | |
return covers, sort(ints) | |
end | |
end | |
function shuffle_DF_to_Intobject(in_df::DataFrame, chr_file::String = "chrom.size.hg38") | |
ch_size = Dict() | |
df_sim = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[]) | |
open(chr_file) do chr_f | |
lines = readlines(chr_f); | |
map(x -> strip(x) |> split |> x -> ch_size[parse(Int64, strip(x[1], ['c', 'h', 'r']))] = parse(Int64, x[2]), lines) | |
end | |
for line in eachrow(in_df) | |
lensd = 1 + Int(line[:coor_e] - line[:coor_s]) | |
stpos = rand(1:(ch_size[line[:chr]] - lensd), 1)[1] | |
enpos = stpos + lensd | |
chr = line[:chr] | |
push!(df_sim, [chr stpos enpos]) | |
end | |
Itree = itree_from_df(df_sim, "no_id") | |
return Interval_object(df_sim, Itree) | |
end | |
function itree_from_df(cnvs_df::DataFrame, comp::String = "ac_id") | |
Itree::Dict{Int64, IntervalTrees.IntervalBTree} = Dict() | |
for line in eachrow(cnvs_df) | |
if comp == "ac_id" | |
Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, Int}}()) | |
push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:ac])) | |
elseif comp == "no_id" | |
Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, Int}}()) | |
push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], 0)) | |
else comp == "id_id" | |
Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, String}}()) | |
push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:ids])) | |
end | |
end | |
return Itree | |
end | |
function calc_overlap_fraction_predict(pyr::IntervalSets.Interval, cnv::IntervalSets.Interval, pnum::Int64) | |
l1, l2 = duration(pyr), duration(cnv) | |
over = IntervalSets.intersect(pyr, cnv) | |
nor_left = (over.left - pyr.left)/l1 | |
nor_right = 1.0 - (pyr.right - over.right)/l1 | |
l_over = duration(over) | |
lens_exp::Array{Float64,1} = [] | |
for i in 1:pnum | |
p1 = rand(1:(l1+l2)) | |
p2 = p1 + l1 | |
l_perm = float(duration(IntervalSets.intersect(p1..p2, l1..(l1+l2)))) | |
push!(lens_exp, l_perm) | |
end | |
return float(l_over), lens_exp, nor_left, nor_right | |
end | |
function plot_coverage(covers, num::Int64 = 1) | |
xs, ys = [], [] | |
for i in covers | |
append!(xs, [leftposition(i), rightposition(i)]) | |
append!(ys, [metadata(i), metadata(i)]) | |
end | |
p = plot(xs, ys/num) | |
return p | |
end | |
function CNV_rare_fixed_overlap(b::Interval_object, CNV_int::Interval_object, comp::String = "norm", comp2::String = "both", n::Int64 = 100) | |
if comp2 == "both" | |
piece = deepcopy(b) | |
elseif comp2 == "gene" | |
piece_df = deepcopy(b.int_DF[b.int_DF[!, :gene_over] .== 1.0, :]) | |
piece_tree = itree_from_df(piece_df, "id_id") | |
piece = Interval_object(piece_df, piece_tree) | |
elseif comp2 == "nogene" | |
piece_df = deepcopy(b.int_DF[b.int_DF[!, :gene_over] .== 0.0, :]) | |
piece_tree = itree_from_df(piece_df, "id_id") | |
piece = Interval_object(piece_df, piece_tree) | |
end | |
mer = zeros(size(piece.int_DF)[1]) | |
bins = Int64[1 1; 2 5; 6 200] | |
overs_df = DataFrame(low_f = Float64[], middle_f = Float64[], high_f = Float64[]) | |
if comp == "shuff" | |
for i in 1:n | |
cnv_shuff = CNVspart.shuffle_DF_to_Intobject(CNV_int.int_DF) | |
CNVspart.overlap_CNVs_pyrs!(cnv_shuff, piece, "shuffle") | |
mer .+= piece.int_DF[!, :shuff_over] | |
push!(overs_df, CNVspart.binning_lin_frequency(piece, bins, :shuff_over)) | |
end | |
mer = mer/n | |
# b.int_DF[!, :shuff_over] = mer | |
elseif comp == "norm" | |
push!(overs_df, CNVspart.binning_lin_frequency(piece, bins, :cnv_over)) | |
end | |
return overs_df | |
end | |
function binning_lin_frequency(b::Interval_object, bins::Array{Int64,2}, col) | |
c_mer = Float64[] | |
for i in eachrow(bins) | |
spl = b.int_DF[i[1] .<= b.int_DF[!, :degree] .<= i[2], :] | |
push!(c_mer, sum(spl[!, col])/size(spl)[1]) | |
end | |
return c_mer | |
end | |
function find_gene_overlap!(any_int::Interval_object, filename::String) | |
gene_int = read_gene_file(filename) | |
any_int.int_DF[!, :gene_over] = zeros(size(any_int.int_DF)[1]) | |
for row in eachrow(any_int.int_DF) | |
chr, coor_s, coor_e = row[:chr], row[:coor_s], row[:coor_e] | |
over = collect(intersect(gene_int.int_tree[chr], (coor_s, coor_e))) | |
if length(over) > 0 | |
row[:gene_over] = 1.0 | |
end | |
end | |
end | |
function read_gene_file(filename::String) | |
genes_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[]) | |
open(filename) do genefile | |
lines = readlines(genefile) | |
chrs = map(x -> split(x)[1], lines) | |
coors = map(x -> split(x)[2:3] |> x -> parse.(Int64, x), lines) | |
chrs = map(x -> replace(x, r"chr" => "") |> x -> x in ["X", "Y"] ? 0 : parse(Int64, x), chrs) | |
[push!(genes_df, [chrs[i] coors[i][1] coors[i][2]]) for i in 1:length(chrs)]; | |
genes_df = genes_df[genes_df[!, :chr] .!== 0,:] | |
end | |
Itree = itree_from_df(genes_df, "no_id") | |
return Interval_object(genes_df, Itree) | |
end | |
end | |
module CNVs_TCGA | |
using DataFrames, Main.SDmoduleInit, Main.CNVspart, IntervalTrees, IntervalSets, StatsBase | |
export read_CNVs_TCGA | |
function read_CNVs_TCGA(file_CNVs::String = "../TCGA_CNV_data/CNVs_BRCA_all.txt", logthr::Float64 = 0.3) | |
cnvs_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], copylog = Float64[]) | |
open(file_CNVs) do cnvfile | |
lines = readlines(cnvfile) | |
copylog = map(x -> split(x)[6] |> x -> parse(Float64, x), lines) | |
chrs = map(x -> split(x)[2], lines) | |
coors = map(x -> split(x)[3:4]|> x -> parse.(Int64, x), lines) | |
chrs = map(x -> x in ["X", "Y"] ? 0 : parse(Int64, x), chrs) | |
[push!(cnvs_df, [chrs[i] coors[i][1] coors[i][2] copylog[i]]) for i in 1:length(chrs)]; | |
cnvs_df = cnvs_df[(cnvs_df[!, :chr] .!== 0) .& (cnvs_df[!, :copylog] .> logthr), :] | |
end | |
Itree = itree_from_df(cnvs_df, "no_id") | |
return Interval_object(cnvs_df, Itree) | |
end | |
end | |
########################################################################################## | |