modules_SDs.jl


module SDmoduleInit

    using IntervalTrees, DataFrames

    export Initialize_SD_object, Initialize_pyr_object, Interval_object, gap_value, Itree_from_DF

    gap_value = 5;

    mutable struct Interval_object
        int_DF::DataFrame
        int_tree::Dict{Int64, IntervalTrees.IntervalBTree}
    end

    function Read_SDs_file_to_DF(filename::String, comp::Int = 1)
        SD_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[],
            chr2 = Int64[], coor_s2 = Int64[], coor_e2 = Int64[], identity=Float64[])
        open(filename) do sdfile
            lines = readlines(sdfile);
            if comp == 1
                #coors_str = map(x -> strip(x) |> split |> x -> (length(x) >= 26 ? x[[1,2,3,7,8,9,26]] : nothing), lines)
                coors_str = map(x -> strip(x) |> split |> x -> x[[1,2,3,7,8,9,26]], lines)
            elseif comp == 2
                #coors_str = map(x -> strip(x) |> split |> x -> ((length(x) >= 26) && !(startswith(x[1], '#')) ? x[[1,2,3,4,5,6,22]] : nothing), lines)
                coors_str = map(x -> strip(x) |> split |> x -> x[[1,2,3,4,5,6,22]], lines)
            end
            map(x -> push!(SD_df ,[parse(Int64, replace(x[1], r"chr" => "")) [parse.(Int64, x[2:3])...]' parse(Int64, replace(x[4], r"chr" => "")) [parse.(Int64, x[5:6])...]' parse(Float64, x[7])]), coors_str)
        end
        #if comp == 2
        #    SD_df = filter_SEDEF_DF!(SD_df)
        #end
        return SD_df
    end

    function Filter_SDs_dataframe(SD_df::DataFrame, comp::String = "left")
        SD_df_filt = similar(SD_df, 0)
        for line in eachrow(SD_df)
            chr1 = line[:chr]
            chr2 = line[:chr2]
            if comp == "left"
                if chr1 < chr2
                    push!(SD_df_filt, line)
                elseif (chr1 == chr2) & (line[:coor_s] < line[:coor_s2])
                    push!(SD_df_filt, line)
                end
            elseif comp == "both"
                push!(SD_df_filt, line)
            end
        end
        SD_df_filt[!, :ids] = map(x -> "id"*string(x), 1:size(SD_df_filt)[1])
        return SD_df_filt
    end

    function Itree_from_DF(DF::DataFrame, comp::String = "SD_IntTree")
        Itree::Dict{Int64, IntervalTrees.IntervalBTree} = Dict()
        if comp == "SD_IntTree"
            for line in eachrow(DF)
                Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, String}}())
                push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:ids]))
                Itree[line[:chr2]] = get(Itree, line[:chr2], IntervalTree{Int, IntervalValue{Int, String}}())
                push!(Itree[line[:chr2]], IntervalValue(line[:coor_s2], line[:coor_e2], line[:ids]))
            end
        elseif comp == "Pyrs_IntTree"
            for line in eachrow(DF)
                Itree = Itree_merge_overlaps(Itree, line[:chr], line[:coor_s], line[:coor_e], gap_value)
                Itree = Itree_merge_overlaps(Itree, line[:chr2], line[:coor_s2], line[:coor_e2], gap_value)
            end
        end
        return Itree
    end

    function Check_sharp_borders_to_ids(SD_df::DataFrame, SD_tree::Dict, comp::String, gap::Int64, gap2::Int64)
        ind_arr = ones(size(SD_df)[1])
        for (i, line) in enumerate(eachrow(SD_df))
            if comp == "first"
                chr, coor_s, coor_e = line[:chr], line[:coor_s], line[:coor_e]
            elseif comp == "second"
                chr, coor_s, coor_e = line[:chr2], line[:coor_s2], line[:coor_e2]
            end
            over = collect(intersect(SD_tree[chr], (coor_s - gap, coor_e + gap)))
            for inter in over
                if ((inter.first - gap <= coor_s) && (coor_s <= inter.first + gap)) || ((inter.last - gap <= coor_e) && (coor_e <= inter.last + gap))
                    if coor_e - coor_s + gap2 < inter.last - inter.first   # don't have to check the id with gap3
                        ind_arr[i] = 0
                    end
                end
                if ((inter.last - gap <= coor_s) && (coor_s <= inter.last + gap)) || ((inter.first - gap <= coor_e) && (coor_e <= inter.first + gap))
                    ind_arr[i] = 1
                    break
                    #fd = 4
                end
            end
        end
        return ind_arr
    end

    function Filt_sharp_borders_SDs(SD_obj::Interval_object, gap::Int64=15, gap2::Int64=100)
        SD_df, SD_tree = SD_obj.int_DF, SD_obj.int_tree
        ind_arr = Check_sharp_borders_to_ids(SD_df, SD_tree, "first", gap, gap2)
        ind_arr2 = Check_sharp_borders_to_ids(SD_df, SD_tree, "second", gap, gap2)
        SD_df2 = SD_df[(ind_arr .== 1) .| (ind_arr2 .== 1), :]                         # in question &
        SD_tree2 = Itree_from_DF(SD_df2, "SD_IntTree")
        return Interval_object(SD_df2, SD_tree2), ind_arr, ind_arr2
    end


    function Itree_merge_overlaps(Itree::Dict, chr::Int64, coor_s::Int64, coor_e::Int64, gap::Int64 = gap_value)
        Itree[chr] = get(Itree, chr, IntervalTree{Int, IntervalValue{Int, String}}())
        over = collect(intersect(Itree[chr], (coor_s - gap, coor_e + gap)))
        if length(over) == 0
            push!(Itree[chr], IntervalValue(coor_s, coor_e, ""))
        else
            left_b = minimum(push!(map(x -> x.first, over), coor_s))
            right_b = maximum(push!(map(x -> x.last, over), coor_e))
            map(x -> delete!(Itree[chr], (x.first, x.last)), over)
            push!(Itree[chr], IntervalValue(left_b, right_b, ""))
        end
        return Itree
    end

    function DF_from_Itree(Itree::Dict)
        Pyrs_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], ids = String[])
        Itree2::Dict{Int64, IntervalTrees.IntervalBTree} = Dict()
        id_num = 0
        for chr in sort(collect(keys(Itree)))
            for inter in Itree[chr]
                id_num += 1
                push!(Pyrs_df, [chr inter.first inter.last "id"*string(id_num)])
                Itree2[chr] = get(Itree2, chr, IntervalTree{Int, IntervalValue{Int, String}}())
                push!(Itree2[chr], IntervalValue(inter.first, inter.last, "id"*string(id_num)))
            end
        end
        return Itree2, Pyrs_df
    end

    function filter_SEDEF_DF!(a_df)
        a_df = a_df[a_df[!, :identity] .>= 0.9, :]
        inds = min.(a_df[!,:coor_e] - a_df[!,:coor_s], a_df[!,:coor_e2] - a_df[!,:coor_s2]) .>= 999
        a_df = a_df[inds, :]
        return a_df
    end

    function Initialize_SD_object(filename::String, comp::Int=1, comp2::String="left"; comp3::String="all")
        DF_prel = Read_SDs_file_to_DF(filename, comp)
        DF_filt = Filter_SDs_dataframe(DF_prel, comp2)
        Itree_SDs = Itree_from_DF(DF_filt, "SD_IntTree")
        SD_obj = Interval_object(DF_filt, Itree_SDs)
        if comp3 == "filter"
            SD_obj = Filt_sharp_borders_SDs(SD_obj)
        end
        return SD_obj
    end

    function Initialize_pyr_object(DF_filt::DataFrame)
        Itree_prel = Itree_from_DF(DF_filt, "Pyrs_IntTree")
        Itree_pyrs, DF_pyrs = DF_from_Itree(Itree_prel)
        Pyr_obj = Interval_object(DF_pyrs, Itree_pyrs)
        return Pyr_obj
    end

end

#####################################################################################################

module NetworkAnalysis

    using LightGraphs, MetaGraphs, IntervalTrees, DataFrames, Main.SDmoduleInit, GraphPlot, StatsBase
    using Colors, Compose, Plots, Histograms, GLM, Statistics, HypothesisTests, Distributions, ArndtLabTheme
    using LaTeXStrings
    pyplot()
    theme(:arndtlab)

    export Graph_pars_object, Initialize_SD_network, Plot_SD_network, Node_degree_Comp_size_distr, Stat_tests_SD_net_features, Spliting_network_diff_ways

    mutable struct Graph_pars_object
        mg::MetaGraph
        self_loops::Dict
        edges_double::Dict
        edges_type::Dict
        edges_tandem::Dict
    end

    function Extract_fields_from_Graph_object(graph_obj::Graph_pars_object)
        out_array = [getfield(graph_obj, field) for field in [:mg, :self_loops, :edges_double, :edges_type, :edges_tandem]]
        return out_array
    end

    function Graph_from_DF(int_obj::Interval_object, SD_df::DataFrame)
        self_loops::Dict = Dict()
        edges_double::Dict = Dict()
        edges_type::Dict = Dict()
        edges_tandem::Dict = Dict()
        mg = MetaGraph(size(int_obj.int_DF)[1], 1.0)     # 1.0 - default weight

        map(x -> set_prop!(mg, x, :id, int_obj.int_DF[!, :ids][x]), 1:size(int_obj.int_DF)[1])
        set_indexing_prop!(mg, :id)
        for line in eachrow(SD_df)
            over1 = collect(intersect(int_obj.int_tree[line[:chr]], (line[:coor_s], line[:coor_e])))
            over2 = collect(intersect(int_obj.int_tree[line[:chr2]], (line[:coor_s2], line[:coor_e2])))
            if length(over1) != 1 || length(over2) != 1
                display("Initialize_graph_from_DF warning")
            end
            vert1 = mg[over1[1].value, :id]
            vert2 = mg[over2[1].value, :id]

            if has_edge(mg, Edge(vert1, vert2))
                edges_double[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_double, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 0) + 1
            elseif !has_edge(mg, Edge(vert1, vert2)) && (vert1 != vert2)
                add_edge!(mg, vert1, vert2)
            end

            if vert1 == vert2
                self_loops[get_prop(mg, vert1, :id)] = get(self_loops, get_prop(mg, vert1, :id), 0) + 1
            else
                if line[:chr] == line[:chr2]
                    edges_type[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_type, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 1)
                    if min(abs(line[:coor_s] - line[:coor_e2]), abs(line[:coor_s2] - line[:coor_e])) < 5e5
                       edges_tandem[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_tandem, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 1)
                    end
                else
                    edges_type[(get_prop(mg, vert1, :id), get_prop(mg, vert2, :id))] = get(edges_type, (get_prop(mg, vert1, :id), get_prop(mg, vert2, :id)), 0)
                end
            end
        end
        graph_obj = Graph_pars_object(mg, self_loops, edges_double, edges_type, edges_tandem)
        return graph_obj
    end

    function Filter_single_nodes_everywhere!(int_obj::Interval_object, graph_obj::Graph_pars_object)
        mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj)
        node_single = [e[1] for e in filter(x -> length(x) == 1, connected_components(mg))]
        id_single = map(x -> get_prop(mg, x, :id), node_single)
        map(x -> rem_prop!(mg, mg[x, :id], :id), id_single)
        mg = mg[filter_vertices(mg, :id)]
        for id in id_single
            line = int_obj.int_DF[int_obj.int_DF[!, :ids] .== id, :]
            over = collect(intersect(int_obj.int_tree[line[!, :chr][1]], (line[!, :coor_s][1], line[!, :coor_e][1])))
            if length(over) == 1
                delete!(int_obj.int_tree[line[!, :chr][1]], (over[1].first, over[1].last))
            elseif length(over) > 1
                display("Filter_single_nodes_everywhere warning")
            end
            int_obj.int_DF = int_obj.int_DF[int_obj.int_DF[!, :ids] .!= id, :]
            delete!(self_loops, id)
        end
        set_indexing_prop!(mg, :id)
        graph_obj2 = Graph_pars_object(mg, self_loops, edges_double, edges_type, edges_tandem)
        return graph_obj2
    end

    function Initialize_SD_network(int_obj::Interval_object, SD_df::DataFrame)
        graph_obj = Graph_from_DF(int_obj, SD_df);
        graph_obj = Filter_single_nodes_everywhere!(int_obj, graph_obj);
        mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj);
        intra_num = reduce(+, [edges_type[i] for i in keys(edges_type)]);
        inter_num = length(edges_type) - intra_num;

        display("Double edges = $(length(edges_double)) = $(length(edges_double)/ne(mg)) of all edges")
        display("Self loops = $(length(self_loops)) = $(length(self_loops)/nv(mg)) of all nodes (or $(404/(nv(mg)+404-259)) without filtering singleton nodes)")
        display("Tandem edges = $(length(edges_tandem)) = $(length(edges_tandem)/ne(mg)) of all edges and $(length(edges_tandem)/intra_num) of all intra")
        display("Intra = $intra_num and Inter = $inter_num edges")

        return graph_obj;
    end

    function Plot_SD_network(graph_obj::Graph_pars_object, comp::String = "normal")
        mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj)
        if comp == "normal"
            gp = gplot(mg)
            draw(PDF("./pics/SD_net.pdf", 120cm, 120cm), gp)
        elseif comp == "self_loops"
            verts = map(x -> mg[x, :id], collect(keys(self_loops)))
            membership = ones(Int, nv(mg))
            membership[verts] .+= 1
            nodecol = [colorant"lightseagreen", colorant"red"][membership]
            gp = gplot(mg, nodefillc=nodecol)
            draw(PDF("./pics/SD_net_self_loops.pdf", 120cm, 120cm), gp)
        elseif comp == "edges_double"
            edgs = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), collect(keys(edges_double)))
            membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg)))
            edgecol = [colorant"grey", colorant"red"][membership]
            gp = gplot(mg, edgestrokec=edgecol)
            draw(PDF("./pics/SD_net_double_edges.pdf", 120cm, 120cm), gp)
        elseif comp == "edges_tandem"
            edgs = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), collect(keys(edges_tandem)))
            membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg)))
            edgecol = [colorant"grey", colorant"red"][membership]
            gp = gplot(mg, edgestrokec=edgecol)
            draw(PDF("./pics/SD_net_tandem_edges.pdf", 120cm, 120cm), gp)                                               elseif comp == "edges_intra"
            edgs = map(x -> edges_type[x] == 1 ? Edge(mg[x[1], :id], mg[x[2], :id]) : 0, collect(keys(edges_type)))
            edgs = edgs[edgs .!= 0]
            membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg)))
            edgecol = [colorant"grey", colorant"red"][membership]
            gp = gplot(mg, edgestrokec=edgecol)
            draw(PDF("./pics/SD_net_intra_edges.pdf", 120cm, 120cm), gp)
        elseif comp == "both_selfloops_intra"
            edgs = map(x -> edges_type[x] == 1 ? Edge(mg[x[1], :id], mg[x[2], :id]) : 0, collect(keys(edges_type)))
            edgs = edgs[edgs .!= 0]
            membership = map(x -> x in edgs ? 2 : 1, collect(edges(mg)))
            edgecol = [colorant"grey", colorant"red"][membership]
            verts = map(x -> mg[x, :id], collect(keys(self_loops)))
            membership = ones(Int, nv(mg))
            membership[verts] .+= 1
            nodecol = [colorant"lightseagreen", colorant"red"][membership]
            gp = gplot(mg, edgestrokec=edgecol, nodefillc=nodecol);
            draw(PDF("./pics/SD_both_selfloops_intra.pdf", 120cm, 120cm), gp);
        end

    end

    function Node_degree_Comp_size_distr(mgs::Array, legends::Array{String,1}, comp=nothing; cc_bin::Int=20, nd_bin::Int=20, thr::Int=5, max_x::Int64=13000)
        colors = ArndtLabTheme.arndtlab_palette
        colors2 = copy(colors)
        colors2[4], colors2[5] = colors[5], colors[4]
        colors2[5], colors2[6] = colors[6], colors[4]
        markers = [:circle, :circle, :circle, :circle, :xcross, :circle, :circle, :circle, :circle]
        m_sizes = [6, 6, 6, 6, 5, 6, 6, 6, 6]
        h = Histograms.Histogram([UInt64(30000)], n=cc_bin, scale=:log)
        cc_p, nd_p, rat_p, rat_m_p = plot(), plot(), plot(), plot()
        for (i, mg) in enumerate(mgs)
            ccs = collect(connected_components(mg))
            cc_len = map(length, ccs);
            ucc_len = convert(Array{UInt,1}, cc_len)
            cc_h = Histograms.Histogram(ucc_len, breaks=h.breaks, scale=:log);
            plot!(cc_p, cc_h, xscale=:log10, yscale=:log10, markershape=markers[i], color=colors2[i],
                    label=legends[i], line=false, markersize=m_sizes[i], xtickfont=Plots.font(13),
                    ytickfont=Plots.font(13), xlim=[1, max_x], legendfontsize=12);
            x = range(2, 1800, length=10); y = 1.5*x.^(-2.7);
            plot!(cc_p, x, y, label="", grid=false, color=colors2[6])
            if comp != nothing
                savefig(comp)
            end

            n_deg = map(x -> length(neighbors(mg, x)), collect(vertices(mg)));
            n_deg = convert.(Float64, n_deg);
            nd_h = Histograms.Histogram(n_deg, N=nd_bin, scale=:identity);
            plot!(nd_p, nd_h, xscale=:identity, yscale=:log10, markershape=markers[i], color=colors2[i], legend=false, grid=false, line=false, markersize=m_sizes[i], xtickfont=Plots.font(13), ytickfont=Plots.font(13));
            #nd_p = plot(nd_h, yscale=:log10, xscale=:log10, markershape=:auto, legend=false, grid=false, line=false, xlim=[1,170]);
            x = range(1, 150, length=10); y = 0.01*exp.(-0.056x);
            plot!(nd_p, x, y, legend=false, grid=false, color=colors2[6])
            #savefig("./pics/node_degree_lin_SD.pdf")

            edgs::Array{Int64,1} = []
            for cc in ccs
                push!(edgs, sum([length(neighbors(mg, node)) for node in cc])/2)
            end
            plot!(rat_p, cc_len, edgs, xscale=:log10, yscale=:log10, color=colors2[i], markersize=m_sizes[i], markershape=markers[i], line=false, label="", grid=false, xtickfont=Plots.font(13), ytickfont=Plots.font(13))
            b, k, rat_p = Regression_from_2_arrays(cc_len, edgs, ((cc_len .> thr)), rat_p, [1, 2000], comp="log", color=colors2[i])
            m_edgs = [mean(edgs[cc_len .== vals]) for vals in sort(unique(cc_len))]
            m_cc_len = sort(unique(cc_len))
            plot!(rat_m_p, m_cc_len, m_edgs, xscale=:log10, yscale=:log10, color=colors2[i], markershape=markers[i], line=false, grid=false, label="", markersize=m_sizes[i], xtickfont=Plots.font(13), ytickfont=Plots.font(13))
            xs = range(1, 2000, length=10);
            ys = (10^b)*xs.^k;
            plot!(rat_m_p, xs, ys, label="α = $k", color=colors2[i])
            #b, k, rat_m_p = Regression_from_2_arrays(m_cc_len, m_edgs, ((m_cc_len .> 5)), rat_m_p, [1, 2000], comp="log", color=colors2[i])
            #savefig("./pics/comps_nodes_edges_SD.pdf")
        end

        display(cc_p)
        #savefig(cc_p, "./pics/animals_cc_distr.pdf")
        display(nd_p)
        #savefig(nd_p, "./pics/animals_nd_distr.pdf")
        display(rat_p)
        gui(rat_m_p)
        #savefig(rat_m_p, "./pics/animals_m_n_e_distr.pdf")
    end

    function Regression_from_2_arrays(arrX::Array, arrY::Array, cond::BitArray{1}, plotobj::Plots.Plot, xlim::Array; comp::String = "linear", color=:red)
        arrX = arrX[cond]; arrY = arrY[cond]
        if comp == "linear"
            data = DataFrame(X=arrX, Y=arrY)
        elseif comp == "log"
            data = DataFrame(X=log10.(arrX), Y=log10.(arrY))
        end
        slm = lm(@formula(Y ~ X), data)
        b, k = coef(slm)
        x = range(xlim[1], xlim[2], length=10);
        if comp == "linear"
           y = k*x .+ b;
        elseif comp == "log"
           y = (10^b)*x.^k;
        end
        k_round = round(k, digits=2)
        plot!(plotobj, x, y, label="α = $k_round", color=color)
        display(k)
        return b, k_round, plotobj
    end

    function Stat_tests_SD_net_features(graph_obj::Graph_pars_object, limits::Array{Int64,1}=[10, 1000])
        mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj)
        big_nodes::Array{String,1} = []
        other_nodes::Array{String,1} = []
        middle_ccs::Array{Array{Int64,1},1} = []
        middle_intra::Array{Int64, 1} = []
        middle_lens::Array{Int64, 1} = []

        ccs = collect(connected_components(mg))
        bc_size = maximum(map(length, ccs))
        map(x -> length(x) == bc_size ? append!(big_nodes, Node_array_to_back_ids(mg, x)) : append!(other_nodes, Node_array_to_back_ids(mg, x)), ccs)

        f_plus, f_minus = 0, 0
        map(x -> x in big_nodes ? f_plus += 1 : f_minus += 1, collect(keys(self_loops)))
        pv_self = pvalue(FisherExactTest(f_plus, f_minus, length(big_nodes), length(other_nodes)))
        display([f_plus/length(big_nodes) f_minus/length(other_nodes)])
        display("Fisher test p-value for self_loops in big/other components = $pv_self")

        f_plus, f_minus = 0, 0
        arr_intra = collect(filter(x -> edges_type[x] == 1, collect(keys(edges_type))))
        map(x -> (x[1] in big_nodes) && (x[2] in big_nodes) ? f_plus += 1 : f_minus += 1, arr_intra)
        ne_big = sum(map(x -> neighbors(mg, mg[x, :id]) |> length, big_nodes))/2
        ne_other = ne(mg) - ne_big
        pv_intra = pvalue(FisherExactTest(f_plus, f_minus, Int(ne_big), Int(ne_other)))
        display([f_plus/Int(ne_big) f_minus/Int(ne_other)])
        display("Fisher test p-value for intra_edges in big/other components = $pv_intra")

        f_plus, f_minus = 0, 0
        map(x -> (x[1] in big_nodes) && (x[2] in big_nodes) ? f_plus += 1 : f_minus += 1, collect(keys(edges_double)))
        ne_big = sum(map(x -> neighbors(mg, mg[x, :id]) |> length, big_nodes))/2
        ne_other = ne(mg) - ne_big
        pv_double = pvalue(FisherExactTest(f_plus, f_minus, Int(ne_big), Int(ne_other)))
        display([f_plus/Int(ne_big) f_minus/Int(ne_other)])
        display("Fisher test p-value for double_edges in big/other components = $pv_double")

        middle_ccs = filter(x -> (length(x) >= limits[1]) && (length(x) <= limits[2]) , ccs)
        for comp in middle_ccs
            comp_intra = 0
            gr = induced_subgraph(mg, comp)[1]
            set_indexing_prop!(gr, :id)
            push!(middle_lens, ne(gr))
            for edg in edges(gr)
                edge = (gr[edg.src, :id], gr[edg.dst, :id])
                comp_intra += edges_type[edge]
            end
            push!(middle_intra, comp_intra)
        end
        chi_val = Calculate_Chi_square(middle_lens, middle_intra)
        simul_chi_vals = Simul_intra_distribution(middle_lens, sum(middle_intra))
        #display(middle_intra./middle_lens)
        num_more = count(x -> x >= chi_val, simul_chi_vals)
        display("There are $num_more simulations with bigger intra- fraction value than observed in middle size components out of $(length(simul_chi_vals))")

    end

    function Node_array_to_back_ids(mg::MetaGraph, node_arr::Array)
        [mg[i, :id] for i in node_arr]
    end

    function Calculate_Chi_square(ns::Array{Int64,1}, ks::Array{Int64,1})
        p = sum(ks)/sum(ns)
        sum(((p*ns .- ks).^2)./(p*ns))
    end

    function Simul_intra_distribution(ns::Array{Int64,1}, num::Int64, iter::Int64=1000)
        vals::Array{Float64,1} = []
        prs = ns/sum(ns)
        distr = Multinomial(num, prs)
        for i in 1:iter
            push!(vals, Calculate_Chi_square(ns, rand(distr)))
        end
        return vals
    end


    function Spliting_network_diff_ways(graph_obj::Graph_pars_object, comp::String)
        mg, self_loops, edges_double, edges_type, edges_tandem = Extract_fields_from_Graph_object(graph_obj)
        cc = connected_components(mg)
        lens = map(length, cc)
        big_nodes = Node_array_to_back_ids(mg, cc[lens .== maximum(lens)][1])
        if comp == "intra" || comp == "intra_nobig"
            arr_intra = collect(filter(x -> edges_type[x] == 1, collect(keys(edges_type))))
            edges_intra = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), arr_intra)
            gr = induced_subgraph(mg, edges_intra)[1]
            set_indexing_prop!(gr, :id)
            if comp == "intra_nobig"
                node_arr = filter(ver -> !(gr[ver, :id] in big_nodes), vertices(gr))
                gr = induced_subgraph(gr, node_arr)[1]
                set_indexing_prop!(gr, :id)
            end
            return gr
        elseif comp == "inter" || comp == "inter_nobig"
            arr_inter = collect(filter(x -> edges_type[x] == 0, collect(keys(edges_type))))
            edges_inter = map(x -> Edge(mg[x[1], :id], mg[x[2], :id]), arr_inter)
            gr = induced_subgraph(mg, edges_inter)[1]
            set_indexing_prop!(gr, :id)
            if comp == "inter_nobig"
                node_arr = filter(ver -> !(gr[ver, :id] in big_nodes), vertices(gr))
                gr = induced_subgraph(gr, node_arr)[1]
                set_indexing_prop!(gr, :id)
            end
            return gr
        end
    end
end

###########################################################################################################

module PyrSDTypes

    using StatsBase, Statistics, DataFrames, Histograms, Plots, MetaGraphs, LightGraphs, StatsPlots, GLM, Main.SDmoduleInit, Main.NetworkAnalysis, DecisionTree, Random, ArndtLabTheme
    pyplot()
    theme(:arndtlab)

    export Real_SD_length_distr, Pyr_length_node_degree!, Modify_DF_for_RandForset!

    function Real_SD_length_distr(Itree_SDs::Dict, Itree_pyrs::Dict, sds_df::DataFrame)
        real_SDs::Array{String, 1} = []
        for chr in keys(Itree_SDs)
            sing_SD = filter(x -> length(collect(intersect(Itree_SDs[chr], x))) == 1, collect(Itree_SDs[chr]))
            append!(real_SDs, [i.value for i in sing_SD])
        end
        real_SDs_df = sds_df[map(x -> x in real_SDs, sds_df[!, :ids]),:]
        #real_SDs_df = real_SDs_df[0.98 .< real_SDs_df[!, :identity] .< 0.9999, :]
        lens::Array{Float64,1} = real_SDs_df[!, :coor_e] - real_SDs_df[!, :coor_s]
        cc_h = Histograms.Histogram(lens, N=30, scale=:log10);
        cc_p = plot(cc_h, xscale=:log10, yscale=:log10, markershape=:auto, line=false);
        x = range(10^3, 10^5.5, length=10); y = 900*x.^(-2);
        cc_p = plot!(x, y, label="a = -2")
        display(mean(lens))
        display(cc_p)
    end

    function Pyr_length_node_degree!(pyrs_df::DataFrame, mg::MetaGraph)
        node_len::Dict = Dict()
        pyrs_df[!, :degree] = zeros(size(pyrs_df)[1])
        pyrs_df[!, :length] = convert.(Float64, pyrs_df[!, :coor_e] .- pyrs_df[!, :coor_s])
        for i in vertices(mg)
            neis = convert(Float64, length(neighbors(mg, i)))
            len = pyrs_df[pyrs_df[!, :ids] .== get_prop(mg, i, :id), :length][1]
            pyrs_df[pyrs_df[!, :ids] .== get_prop(mg, i, :id), :degree] = neis
            node_len[neis] = push!(get(node_len, neis, Float64[]), len)
        end

        nd = sort(collect(keys(node_len)))
        mlen = map(x -> mean(node_len[x]), nd)
        p = plot(nd[nd .< 30], mlen[nd .< 30], markershape=:auto, line=false, legend=false, markersize=6, xtickfont=Plots.font(13), xticks=[0,5,10,15,20,25,30], xlim=[0,30], ytickfont=Plots.font(13))
        p = plot!(nd[nd .< 30], 3625 .+ 3700*nd[nd .< 30])
        display(p)
        #savefig("./pics/length_nd_SD.pdf")
        p = plot(nd, mlen, markershape=:auto, line=false)
        p = plot!(nd[nd .< 30], 3625 .+ 3700*nd[nd .< 30])
        display(p)
        q = plot(log10.(pyrs_df[!, :degree]), log10.(pyrs_df[!, :length]), markershape=:auto, line=false, markersize = 1.0)
        gui(q)
    end

    function Pyr_nd_bins_length(pyrs_df::DataFrame, fr_starts::Array, fr_ends::Array)
        low_freq_len = pyrs_df[fr_starts[1] .<= pyrs_df[:degree] .<= fr_ends[1] , :length]
        mid_freq_len = pyrs_df[fr_starts[2] .<= pyrs_df[:degree] .<= fr_ends[2] , :length]
        high_freq_len = pyrs_df[fr_starts[3] .<= pyrs_df[:degree] .<= fr_ends[3] , :length]
        #p = histogram(low_freq_len, bins=:scott, weights=repeat(1:5, outer=200))
    end


    function Modify_DF_for_RandForset!(a::Interval_object, b::Interval_object, segs_int::Interval_object, graph_obj::Graph_pars_object, mean_covs::Array{Float64,1})

        df = DataFrame(length=Float64[], self=Int64[], double=Float64[], degree=Float64[], comp_s=Float64[], mean_cov=Float64[], breaks = Int64[], max_cov = Int64[], intra_frac = Float64[], max_len = Int64[])

        d_e = Dict()
        breaks = Dict()
        ls = Int64[]
        b.int_DF[!, :mean_cov] = mean_covs

        b.int_DF[!, :comp_size] = zeros(size(b.int_DF)[1])
        ccs = connected_components(graph_obj.mg)
        for com in ccs
            len = length(com)
            map(x -> get_prop(graph_obj.mg, x, :id) |> x -> b.int_DF[b.int_DF[!, :ids] .== x, :comp_size] = len, com)
        end

        for i in keys(graph_obj.edges_double)
            d_e[i[1]] = get(d_e, i[1], 0) + graph_obj.edges_double[i]
            d_e[i[2]] = get(d_e, i[2], 0) + graph_obj.edges_double[i]
        end

        for i in keys(b.int_tree)
            for j in collect(intersect(b.int_tree[i], a.int_tree[i]))
                breaks[j[1].value] = get(breaks, j[1].value, reshape(Int64[], (0, 2)))
                breaks[j[1].value] = vcat(breaks[j[1].value], [j[2].first j[2].last])
            end
        end

        for i in keys(breaks)
            l = 0
            for j in 0:10
                l += length(unique(breaks[i][map(x -> x + j - 5 in breaks[i][:,2], breaks[i][:,1]),2]))
            end
            push!(ls, l)
        end

        b.int_DF[!, :pyr_breaks] = ls

        for i in eachrow(b.int_DF)
            deg = i[:degree]
            len = i[:length]
            mcov = i[:mean_cov]
            brks = i[:pyr_breaks]
            comp_s = i[:comp_size]
            self = get(graph_obj.self_loops, i[:ids], 0)
            double = get(d_e, i[:ids], 0)
            sds = collect(intersect(a.int_tree[i[:chr]], (i[:coor_s], i[:coor_e])))
            max_len = maximum(map(x -> x.last - x.first + 1, sds))
            intra_frac = mean(map(x -> i[:chr] == a.int_DF[a.int_DF[!, :ids] .== x.value, :chr2][1] ? 1.0 : 0.0, sds))
            segs = collect(intersect(segs_int.int_tree[i[:chr]], (i[:coor_s], i[:coor_e])))
            max_cov = maximum(map(x -> x.value, segs))
            push!(df, [len self double deg comp_s mcov brks max_cov intra_frac max_len])
        end

        b.int_DF[!, :intra_frac] = df[:, :intra_frac]

        lms = lm(@formula(length ~ self + double + degree + comp_s + mean_cov + breaks + max_cov + intra_frac), df)
        display(r2(lms))
        display(lms)
        return df
    end

end

##############################################################################################

module SegmentBlocks

    using IntervalTrees, Main.SDmoduleInit, DataFrames, StatsBase, Statistics, GenomicFeatures, IntervalSets, Histograms, Plots, ArndtLabTheme
    pyplot()
    theme(:arndtlab)

    export pyramid_coverage_calc, plot_segments_distribution, itree_from_df

    function find_segments_from_it(int_SDs::Interval_object, int_pyrs::Interval_object)
        dic_starts, dic_ends = Dict(), Dict()

        for chr in keys(int_pyrs.int_tree)
            for it in int_pyrs.int_tree[chr]
                sds_set = intersect(int_SDs.int_tree[chr], it)
                for sd in sds_set
                    dic_starts[chr] = get(dic_starts, chr, Int64[])
                    dic_ends[chr] = get(dic_ends, chr, Int64[])
                    push!(dic_starts[chr], sd.first)
                    push!(dic_ends[chr], sd.last)
                end
            end
        end

        out_df = find_segments_second(dic_starts, dic_ends)
        itree = itree_from_df(out_df)

        return Interval_object(out_df, itree)
    end

    function find_segments_second(dic_starts::Dict, dic_ends::Dict)

        out_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], length = Int64[], cover = Int64[])

        map(x -> sort!(dic_starts[x]), collect(keys(dic_starts)))
        map(x -> sort!(dic_ends[x]), collect(keys(dic_ends)))

        for chr in keys(dic_starts)
            st_ind, en_ind, cover = 1, 1, 0
            covers = Int64[]

            push!(dic_starts[chr], 5e10)
            push!(dic_ends[chr], 5e10)
            while en_ind <= length(dic_ends[chr]) - 1
                if dic_starts[chr][st_ind] < dic_ends[chr][en_ind]
                    cover += 1
                    st_ind += 1
                elseif dic_starts[chr][st_ind] >= dic_ends[chr][en_ind]
                    cover -= 1
                    en_ind += 1
                end
                push!(covers, cover)
            end

            coors = sort(append!(dic_starts[chr], dic_ends[chr]))
            for i in 1:(length(coors) - 3)                          # because added 2 big values;
                push!(out_df, (chr, coors[i], coors[i + 1], coors[i + 1] - coors[i], covers[i]))
            end
        end
        out_df = out_df[(out_df[!, :length] .!= 0) .& (out_df[!, :cover] .!= 0), :]

        return out_df
    end


    function itree_from_df(cnvs_df::DataFrame, comp::String = "cover")
        Itree::Dict{Int64, IntervalTrees.IntervalBTree} = Dict()

        for line in eachrow(cnvs_df)
            Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, Int}}())
            if comp == "cover"
                push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:cover]))
            elseif comp == "nothing"
                push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], 0))
            end
        end

        return Itree
    end


    function pyramid_coverage_calc(int_pyrs::Interval_object, int_sds::Interval_object)
        mean_covs = Float64[]
        all_sds::Array{GenomicFeatures.Interval, 1} = []

        for line in eachrow(int_pyrs.int_DF)
            len = line[:length]
            chr, coor1, coor2 = line[:chr], line[:coor_s], line[:coor_e]

            sds = collect(intersect(int_sds.int_tree[chr], (coor1, coor2)))
            sds2 = [IntervalSets.intersect(coor1..coor2, i.first..i.last) for i in sds]
            sds3 = [(i.left - coor1)/len..(i.right - coor1)/len for i in sds2]
            sds4 = [GenomicFeatures.Interval("1", 1 + floor(Int, 10000*i.left), 1 + floor(Int, 10000*i.right)) for i in sds3]

            if length(sds4) > 0
                append!(all_sds, sds4)
                cov_loc = coverage(sort(sds4))
                mean_cov = sum(map(x -> (rightposition(x) - leftposition(x) + 1)*metadata(x), cov_loc))/10001
                push!(mean_covs, mean_cov)
            else
                push!(mean_covs, 0.0)
            end
        end

        covers = coverage(sort(all_sds))
        return mean_covs, covers
    end


    function plot_segments_distribution(a::Interval_object, b::Interval_object; len_thr::Int64=1000)

        segs_int = SegmentBlocks.find_segments_from_it(a, b)

        sf = segs_int.int_DF[segs_int.int_DF[!, :length] .> len_thr, :]

        cc_h = Histograms.Histogram(collect(sf[!, :cover]), N=40, scale=:log10)
        cc_p = plot(cc_h, xscale=:log10, yscale=:log10, markershape=:auto, line=false)
        x = range(1, 100, length=10); y = 2*x.^(-2);
        cc_p = plot!(x, y)
        gui(cc_p)

        display("Only long segments genome fraction:")
        display(sum(segs_int.int_DF[segs_int.int_DF[!, :length] .> len_thr, :length])/2.88e9)
        display("All segments genome fraction:")
        display(sum(segs_int.int_DF[!, :length])/2.88e9)

        return segs_int
    end

end

###############################################################################################


module CNVspart

    using DataFrames, Main.SDmoduleInit, IntervalTrees, IntervalSets, StatsBase, GenomicFeatures, Plots, ArndtLabTheme
    pyplot()
    theme(:arndtlab)

    export read_CNVs_to_DF, shuffle_DF_to_Intobject, overlap_CNVs_pyrs!, CNV_rare_fixed_overlap, find_gene_overlap!, itree_from_df

    function read_CNVs_to_DF(file_SDs::String = "../CNV_pop_cancer/CNVs_only_popul_hg38.txt", ac_thr::Array{Int64,1} = [1, 2504], samplesize::Int64=5008)

        cnvs_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], ac = Int64[])

        open(file_SDs) do sdfile
            lines = readlines(sdfile)
            acs = []
            for l in lines
                a = split(l)
                cn_vals = [x.match for x in eachmatch(r"<CN[0-9]+>", a[5])]
                m = match(r"AC=([0-9,]+);", a[8])
                ac_vals = parse.(Int64, split(m.captures[1], ","))
                ac = sum(ac_vals[cn_vals .!= "<CN0>"])
                push!(acs, ac <= samplesize/2 ? ac : samplesize - ac)
            end

            #acs = map(x -> strip(x) |> x -> split(x)[8] |> x -> split(x, r",|;")[1] |> x -> split(x, "=")[2] |> x -> parse(Int64, x) |> x -> x <= samplesize/2 ? x : samplesize - x, lines)
            chrs = map(x -> split(x)[1], lines)

            coor_s = map(x -> split(x)[2] |> x -> parse(Int64, x), lines)
            coor_e = map(x -> split(x)[8] |> x -> match(r"END=[0-9]+", x).match |> x -> split(x, "=")[2] |> x -> parse(Int64, x), lines)
            chrs = map(x -> x in ["X", "Y", "M"] ? 0 : parse(Int64, x), chrs)
            [push!(cnvs_df, [chrs[i] coor_s[i] coor_e[i] acs[i]]) for i in 1:length(chrs)];
            cnvs_df = cnvs_df[(cnvs_df[!, :chr] .!== 0) .& (ac_thr[1] .<= cnvs_df[!, :ac] .<= ac_thr[2]), :]
        end

        Itree = itree_from_df(cnvs_df)

        return Interval_object(cnvs_df, Itree)
    end

    function overlap_CNVs_pyrs!(cnv_int::Interval_object, pyrs_int::Interval_object, comp::String = "real"; pnum::Int64 = 1000)

        if comp == "real"
            pyrs_int.int_DF[!, :cnv_over] = zeros(Int64, size(pyrs_int.int_DF)[1])
        elseif comp == "shuffle"
            pyrs_int.int_DF[!, :shuff_over] = zeros(Int64, size(pyrs_int.int_DF)[1])
        elseif comp == "perm_over"
            pyrs_int.int_DF[!, :cnv_over] = zeros(Int64, size(pyrs_int.int_DF)[1])
            pyrs_int.int_DF[!, :over_len] = zeros(Float64, size(pyrs_int.int_DF)[1])
            pyrs_int.int_DF[!, :nor_left] = zeros(Float64, size(pyrs_int.int_DF)[1])
            pyrs_int.int_DF[!, :nor_right] = zeros(Float64, size(pyrs_int.int_DF)[1])
            perm_overs = reshape(Float64[], (0, pnum))
        elseif comp == "shuffle2"
            pyrs_int.int_DF[!, :shuff_over] = zeros(Int64, size(pyrs_int.int_DF)[1])
            pyrs_int.int_DF[!, :nor_left] = zeros(Float64, size(pyrs_int.int_DF)[1])
            pyrs_int.int_DF[!, :nor_right] = zeros(Float64, size(pyrs_int.int_DF)[1])
        end

        for k in keys(pyrs_int.int_tree)
            if k in keys(cnv_int.int_tree)
                for inter in intersect(pyrs_int.int_tree[k], cnv_int.int_tree[k])
                    if comp == "real"
                        pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over][1] + 1
                    elseif comp == "shuffle"
                        pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over][1] + 1
                    elseif comp == "perm_over"
                        l_over, lens_exp, nor_left, nor_right = calc_overlap_fraction_predict(inter[1].first..inter[1].last, inter[2].first..inter[2].last, pnum)
                        if pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over][1] == 0
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = nor_left
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = nor_right
                        else
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = 0.0
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = 1.0
                        end
                        pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :cnv_over][1] + 1
                        pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :over_len] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :over_len][1] + l_over
                        perm_overs = vcat(perm_overs, lens_exp')
                    elseif comp == "shuffle2"
                        l_over, lens_exp, nor_left, nor_right = calc_overlap_fraction_predict(inter[1].first..inter[1].last, inter[2].first..inter[2].last, 1)
                        if pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over][1] == 0
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = nor_left
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = nor_right
                        else
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_left] = 0.0
                            pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :nor_right] = 1.0
                        end
                        pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over] = pyrs_int.int_DF[pyrs_int.int_DF[!, :ids] .== inter[1].value, :shuff_over][1] + 1
                    end
                end
            end
        end

        if comp == "perm_over"
            sums = [sum(perm_overs[!, i]) for i in 1:size(perm_overs)[2]]
            pval = countmap(sums .> sum(pyrs_int.int_DF[!, :over_len]))[:true]/pnum
            display("p-value = $(pval) it means nothing")
        end

        if comp == "shuffle2" || comp == "perm_over"
            only_over = pyrs_int.int_DF[pyrs_int.int_DF[!, :nor_right] - pyrs_int.int_DF[!, :nor_left] .> 0.0, :]
            ints = [GenomicFeatures.Interval("1", 1 + floor(Int, 10000*i[:, :nor_left]), 1 + floor(Int, 10000*i[:, :nor_right])) for i in eachrow(only_over)]
            covers = coverage(sort(ints))
            return covers, sort(ints)
        end
    end

    function shuffle_DF_to_Intobject(in_df::DataFrame, chr_file::String = "chrom.size.hg38")
        ch_size = Dict()
        df_sim = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[])

        open(chr_file) do chr_f
            lines = readlines(chr_f);
            map(x -> strip(x) |> split |> x -> ch_size[parse(Int64, strip(x[1], ['c', 'h', 'r']))] = parse(Int64, x[2]), lines)
        end

        for line in eachrow(in_df)
            lensd = 1 + Int(line[:coor_e] - line[:coor_s])
            stpos = rand(1:(ch_size[line[:chr]] - lensd), 1)[1]
            enpos = stpos + lensd
            chr = line[:chr]
            push!(df_sim, [chr stpos enpos])
        end

        Itree = itree_from_df(df_sim, "no_id")

        return Interval_object(df_sim, Itree)
    end


    function itree_from_df(cnvs_df::DataFrame, comp::String = "ac_id")
        Itree::Dict{Int64, IntervalTrees.IntervalBTree} = Dict()

        for line in eachrow(cnvs_df)
            if comp == "ac_id"
                Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, Int}}())
                push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:ac]))
            elseif comp == "no_id"
                Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, Int}}())
                push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], 0))
            else comp == "id_id"
                Itree[line[:chr]] = get(Itree, line[:chr], IntervalTree{Int, IntervalValue{Int, String}}())
                push!(Itree[line[:chr]], IntervalValue(line[:coor_s], line[:coor_e], line[:ids]))
            end
        end
        return Itree
    end


    function calc_overlap_fraction_predict(pyr::IntervalSets.Interval, cnv::IntervalSets.Interval, pnum::Int64)
        l1, l2 = duration(pyr), duration(cnv)
        over = IntervalSets.intersect(pyr, cnv)
        nor_left = (over.left - pyr.left)/l1
        nor_right = 1.0 - (pyr.right - over.right)/l1

        l_over = duration(over)
        lens_exp::Array{Float64,1} = []

        for i in 1:pnum
            p1 = rand(1:(l1+l2))
            p2 = p1 + l1
            l_perm = float(duration(IntervalSets.intersect(p1..p2, l1..(l1+l2))))
            push!(lens_exp, l_perm)
        end

        return float(l_over), lens_exp, nor_left, nor_right
    end

    function plot_coverage(covers, num::Int64 = 1)
        xs, ys = [], []
        for i in covers
            append!(xs, [leftposition(i), rightposition(i)])
            append!(ys, [metadata(i), metadata(i)])
        end
        p = plot(xs, ys/num)
        return p
    end

    function CNV_rare_fixed_overlap(b::Interval_object, CNV_int::Interval_object, comp::String = "norm", comp2::String = "both", n::Int64 = 100)

        if comp2 == "both"
            piece = deepcopy(b)
        elseif comp2 == "gene"
            piece_df = deepcopy(b.int_DF[b.int_DF[!, :gene_over] .== 1.0, :])
            piece_tree = itree_from_df(piece_df, "id_id")
            piece = Interval_object(piece_df, piece_tree)
        elseif comp2 == "nogene"
            piece_df = deepcopy(b.int_DF[b.int_DF[!, :gene_over] .== 0.0, :])
            piece_tree = itree_from_df(piece_df, "id_id")
            piece = Interval_object(piece_df, piece_tree)
        end

        mer = zeros(size(piece.int_DF)[1])
        bins = Int64[1 1; 2 5; 6 200]
        overs_df = DataFrame(low_f = Float64[], middle_f = Float64[], high_f = Float64[])

        if comp == "shuff"
            for i in 1:n
                cnv_shuff = CNVspart.shuffle_DF_to_Intobject(CNV_int.int_DF)
                CNVspart.overlap_CNVs_pyrs!(cnv_shuff, piece, "shuffle")
                mer .+= piece.int_DF[!, :shuff_over]
                push!(overs_df, CNVspart.binning_lin_frequency(piece, bins, :shuff_over))
            end
            mer = mer/n
           # b.int_DF[!, :shuff_over] = mer
        elseif comp == "norm"
            push!(overs_df, CNVspart.binning_lin_frequency(piece, bins, :cnv_over))
        end

        return overs_df
    end

    function binning_lin_frequency(b::Interval_object, bins::Array{Int64,2}, col)
        c_mer = Float64[]
        for i in eachrow(bins)
            spl = b.int_DF[i[1] .<= b.int_DF[!, :degree] .<= i[2], :]
            push!(c_mer, sum(spl[!, col])/size(spl)[1])
        end
        return c_mer
    end

    function find_gene_overlap!(any_int::Interval_object, filename::String)
        gene_int = read_gene_file(filename)
        any_int.int_DF[!, :gene_over] = zeros(size(any_int.int_DF)[1])
        for row in eachrow(any_int.int_DF)
            chr, coor_s, coor_e = row[:chr], row[:coor_s], row[:coor_e]
            over = collect(intersect(gene_int.int_tree[chr], (coor_s, coor_e)))
            if length(over) > 0
                row[:gene_over] = 1.0
            end
        end
    end

    function read_gene_file(filename::String)
        genes_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[])
        open(filename) do genefile
            lines = readlines(genefile)
            chrs = map(x -> split(x)[1], lines)
            coors = map(x -> split(x)[2:3] |> x -> parse.(Int64, x), lines)
            chrs = map(x -> replace(x, r"chr" => "") |> x -> x in ["X", "Y"] ? 0 : parse(Int64, x), chrs)
            [push!(genes_df, [chrs[i] coors[i][1] coors[i][2]]) for i in 1:length(chrs)];
            genes_df = genes_df[genes_df[!, :chr] .!== 0,:]
        end
        Itree = itree_from_df(genes_df, "no_id")

        return Interval_object(genes_df, Itree)
    end

end


module CNVs_TCGA

    using DataFrames, Main.SDmoduleInit, Main.CNVspart, IntervalTrees, IntervalSets, StatsBase

    export read_CNVs_TCGA

    function read_CNVs_TCGA(file_CNVs::String = "../TCGA_CNV_data/CNVs_BRCA_all.txt", logthr::Float64 = 0.3)

        cnvs_df = DataFrame(chr = Int64[], coor_s = Int64[], coor_e = Int64[], copylog = Float64[])

        open(file_CNVs) do cnvfile
            lines = readlines(cnvfile)
            copylog = map(x -> split(x)[6] |> x -> parse(Float64, x), lines)
            chrs = map(x -> split(x)[2], lines)
            coors = map(x -> split(x)[3:4]|> x -> parse.(Int64, x), lines)
            chrs = map(x -> x in ["X", "Y"] ? 0 : parse(Int64, x), chrs)
            [push!(cnvs_df, [chrs[i] coors[i][1] coors[i][2] copylog[i]]) for i in 1:length(chrs)];
            cnvs_df = cnvs_df[(cnvs_df[!, :chr] .!== 0) .& (cnvs_df[!, :copylog] .> logthr), :]
        end

        Itree = itree_from_df(cnvs_df, "no_id")

        return Interval_object(cnvs_df, Itree)
    end

end


##########################################################################################