Skip to content

Commit

Permalink
remove kludge
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebd99 committed Dec 7, 2023
1 parent 513e73d commit 4593325
Show file tree
Hide file tree
Showing 11 changed files with 70 additions and 79 deletions.
1 change: 1 addition & 0 deletions Experiments/Experiments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ using CSV, DataFrames
using Parquet2: Dataset
using DelimitedFiles: writedlm
using BenchmarkTools
using Random

include("../Source/CardinalityWithColors.jl")
include("utils.jl")
Expand Down
4 changes: 1 addition & 3 deletions Experiments/Scripts/coloring_methods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@ partitioning_schemes = [
[(Hash, 64)],
[(Degree, 8), (QuasiStable, 32), (NeighborNodeLabels, 24)],
[(Degree, 8), (NeighborNodeLabels, 24), (QuasiStable, 32)]]
partitioning_schemes = [
[(QuasiStable, 64)],
]
partitioning_schemes = [[(QuasiStable, 64)]]
experiment_params = Vector{ExperimentParams}()
for dataset in datasets
for scheme in partitioning_schemes
Expand Down
4 changes: 2 additions & 2 deletions Experiments/Scripts/memory_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ experiment_params = Vector{ExperimentParams}()
build_params = Vector{ExperimentParams}()
for dataset in datasets
for n in num_colors
push!(build_params, ExperimentParams(dataset=dataset, num_colors=n))
push!(build_params, ExperimentParams(dataset=dataset, partitioning_scheme=[(QuasiStable, n)]))
end
end
build_experiments(build_params)

graph_grouped_bar_plot(build_params; grouping=number_of_colors,
y_type=memory_footprint,
y_lims=[0, 16],
filename="memory_size_vs_colors_fp32_int16")
filename="memory_size_vs_colors_fp32_int16_2")
14 changes: 6 additions & 8 deletions Experiments/Scripts/update_experiments.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
using Plots.PlotMeasures
using Graphs
include("../Experiments.jl")

# datasets::Vector{DATASET} = [wordnet]
datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
datasets::Vector{DATASET} = [wordnet]
# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6
proportions_not_updated = [0, 0.2, 0.4, 0.6, 0.8, 1]
proportions_not_updated = [0.05, 0.2, 0.4, 0.6, 0.8, 1]

experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_cycle, proportion_not_updated=current_proportion)
for current_dataset in datasets for current_cycle in 2:max_cycles for current_proportion in proportions_not_updated]
experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, max_cycle_size=current_cycle, proportion_not_updated=current_proportion)
for current_dataset in datasets for current_cycle in 6:max_cycles for current_proportion in proportions_not_updated]
println("started building")
build_experiments(experiment_params_list)
println("started estimating")
Expand All @@ -18,4 +16,4 @@ println("started graphing")
# compare how overall accuracy is affected by summary updates
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates")
# compare how cycle stat accuracies are affected by summary updates
graph_grouped_box_plot(experiment_params_list, x_type=proportion_not_updated, y_type=estimate_error, grouping=cycle_size, filename="cycle-stats-and-updates")
graph_grouped_box_plot(experiment_params_list, x_type=proportion_not_updated, y_type=estimate_error, grouping=cycle_size, filename="cycle-stats-and-updates")
4 changes: 3 additions & 1 deletion Experiments/graph_results.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated proportion_deleted
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated proportion_deleted deg_stat_type

@enum VALUE estimate_error runtime build_time memory_footprint

Expand Down Expand Up @@ -247,6 +247,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR
return experiment_param.summary_params.proportion_not_updated
elseif value_type == proportion_deleted
return experiment_param.summary_params.proportion_deleted
elseif value_type == deg_stat_type
return experiment_param.summary_params.deg_stats_type
else
# default to grouping by technique
return experiment_param.summary_params.partitioning_scheme
Expand Down
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Probably = "2172800d-0309-5a57-a84f-d50c94757422"
QuasiStableColors = "9c3856af-3e7c-4d34-a6af-a406867b22e4"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
Expand Down
31 changes: 28 additions & 3 deletions Source/CardinalityWithColors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,45 @@ StartEndColorPair = Tuple{Color, Color}
colors::StartEndColorPair
end

@enum PARTITIONER QuasiStable Hash Degree DirectedDegree SimpleLabel InOut LabelInOut NeighborEdges MostNeighbors NeighborNodeLabels
@enum PARTITIONER QuasiStable Hash Degree NeighborNodeLabels

"""
    partitioner_to_string(x::PARTITIONER)

Return the short abbreviation for partitioner `x`: `"QS"` for `QuasiStable`,
`"H"` for `Hash`, `"D"` for `Degree` and `"NNL"` for `NeighborNodeLabels`.
Any value not listed here yields `nothing`.
"""
function partitioner_to_string(x::PARTITIONER)
    x == QuasiStable && return "QS"
    x == Hash && return "H"
    x == Degree && return "D"
    x == NeighborNodeLabels && return "NNL"
    # No abbreviation is defined for any other value.
    return nothing
end

PartitioningScheme = Vector{Tuple{PARTITIONER, Int}}

# Compact display of a partitioning scheme. Each `(partitioner, n)` pair is
# rendered as `<abbreviation>:<n>`, the pairs are separated by `;`, and the
# whole list is wrapped in square brackets, e.g. `[QS:64]` or `[D:8;QS:32]`.
# An empty scheme renders as `[]`.
function Base.show(io::IO, x::Vector{Tuple{PARTITIONER, Int}})
    entries = (partitioner_to_string(p) * ":" * string(n) for (p, n) in x)
    rendered = "[" * join(entries, ";") * "]"
    # NOTE(review): `show` on a String writes it with surrounding double
    # quotes; confirm the quotes are wanted wherever this output is used.
    return show(io, rendered)
end

struct ColorSummaryParams
deg_stats_type::Type
num_colors::Int
max_cycle_size::Int
max_partial_paths::Int
partitioning_scheme::Vector{Tuple{PARTITIONER, Int}}
partitioning_scheme::PartitioningScheme
weighting::Bool
proportion_not_updated::Float16
proportion_deleted::Float16

function ColorSummaryParams(;deg_stats_type = AvgDegStats, max_cycle_size=4, max_partial_paths=1000,
partitioning_scheme::Vector{Tuple{PARTITIONER, Int}} = [(QuasiStable, 64)], weighting=true, proportion_not_updated = 1.0, proportion_deleted=0.0)
partitioning_scheme::PartitioningScheme = [(QuasiStable, 64)], weighting=true, proportion_not_updated = 1.0, proportion_deleted=0.0)
num_colors = sum([x[2] for x in partitioning_scheme])
return new(deg_stats_type, num_colors, max_cycle_size, max_partial_paths, partitioning_scheme, weighting, proportion_not_updated, proportion_deleted)
end
Expand Down
10 changes: 5 additions & 5 deletions Source/ColorSummary.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ function add_summary_node!(summary::ColorSummary{AvgDegStats}, node_labels, node
summary.edge_deg[edge_label][node_label] = Dict()
end
for other_color in keys(summary.edge_deg[edge_label][node_label])
current_ds = get(summary.edge_deg[edge_label][node_label][other_color], color, AvgDegStats(0.0))
current_ds = get(summary.edge_deg[edge_label][node_label][other_color], color, AvgDegStats(0, 0))
current_cardinality = get(summary.color_label_cardinality[color], node_label, 0)
avg_in = current_ds.avg_in * (current_cardinality / (current_cardinality + 1))
avg_out = current_ds.avg_out * (current_cardinality / (current_cardinality + 1))
Expand Down Expand Up @@ -96,7 +96,7 @@ function delete_summary_node!(summary::ColorSummary{AvgDegStats}, node_labels, n
for node_label in node_labels
for other_color in keys(summary.edge_deg[edge_label][node_label])
current_cardinality = get(summary.color_label_cardinality[color], node_label, 0)
current_deg = get(summary.edge_deg[edge_label][node_label][other_color], color, DS(0.0))
current_deg = get(summary.edge_deg[edge_label][node_label][other_color], color, AvgDegStats(0, 0))
scale_factor = current_cardinality <= 1 ? 0 : (current_cardinality / (current_cardinality - 1))
summary.edge_deg[edge_label][node_label][other_color][color] = AvgDegStats(current_deg.avg_in*scale_factor, current_deg.avg_out*scale_factor)
end
Expand Down Expand Up @@ -159,7 +159,7 @@ function update_edge_degrees!(summary::ColorSummary{AvgDegStats}, start_node, en
if !haskey(summary.edge_deg[edge_label][vertex_label], start_color)
summary.edge_deg[edge_label][vertex_label][start_color] = Dict()
end
current_deg = get(summary.edge_deg[edge_label][vertex_label][start_color], end_color, DS(0.0))
current_deg = get(summary.edge_deg[edge_label][vertex_label][start_color], end_color, AvgDegStats(0,0))
original_avg_out = current_deg.avg_out
new_avg_out = c1_count == 0 ? 0 :
min(((original_avg_out * c1_count) + probability_end_vertex_label), c1_count * summary.color_label_cardinality[end_color][vertex_label]) / c1_count
Expand Down Expand Up @@ -187,7 +187,7 @@ function update_edge_degrees!(summary::ColorSummary{AvgDegStats}, start_node, en
if !haskey(summary.edge_deg[edge_label][vertex_label], end_color)
summary.edge_deg[edge_label][vertex_label][end_color] = Dict()
end
current_deg = get(summary.edge_deg[edge_label][vertex_label][end_color], start_color, DS(0.0))
current_deg = get(summary.edge_deg[edge_label][vertex_label][end_color], start_color, AvgDegStats(0, 0))
original_avg_in = current_deg.avg_in
new_avg_in = c2_count == 0 ? 0 :
min(((original_avg_in * c2_count) + probability_start_vertex_label), c2_count * summary.color_label_cardinality[start_color][vertex_label]) / c2_count
Expand All @@ -208,7 +208,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
color_filters::Dict{Color, SmallCuckoo} = Dict()
color_label_cardinality::Dict{Color, Any} = Dict()
color_hash::Dict{NodeId, Color} = color_graph(g, params)
num_colors = maximum(values(color_hash))
num_colors = maximum(values(color_hash); init = 0)
color_sizes = [0 for _ in 1:num_colors]
for c in values(color_hash)
color_sizes[c] += 1
Expand Down
3 changes: 3 additions & 0 deletions Source/ColoringMethods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,9 @@ end


function color_graph(g::DataGraph, params::ColorSummaryParams)
if nv(g.graph) == 0
return Dict()
end
color_hash::Dict{NodeId, Color} = Dict(i => 1 for i in 1:nv(g.graph))
for (partitioner, num_colors) in params.partitioning_scheme
color_hash = if partitioner == QuasiStable
Expand Down
67 changes: 15 additions & 52 deletions Source/DegreeStats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,15 @@ end
"""
    get_out_deg_estimate(d::DegreeStats)

Fallback method for the abstract `DegreeStats` type. It always throws an
`ErrorException`; the concrete degree-statistics types define their own methods.
"""
function get_out_deg_estimate(d::DegreeStats)
    # `error(msg)` throws `ErrorException(msg)`.
    error("DegreeStats is an abstract type, you probably meant to call a particular instance.")
end

function scale_out_deg(d::DegreeStats, s::Float64)
function get_in_deg_estimate(d::DegreeStats)
throw(ErrorException("DegreeStats is an abstract type, you probably meant to call a particular instance."))
end

function scale_in_deg(d::DegreeStats, s::Float64)
throw(ErrorException("DegreeStats is an abstract type, you probably meant to call a particular instance."))
end


abstract type StatAccumulator end

# Every accumulator needs a way to be initialized to represent a single starting color based
# on the cardinality of that color.
function StatAccumulator(c::Float64)
function StatAccumulator(c)
throw(ErrorException("StatAccumulator is an abstract type, you probably meant to call a particular instance."))
end

Expand All @@ -47,7 +41,7 @@ end

# Multiplication of this kind happens during sampling and handle_extra_edges to scale the
# weight of a particular path up or down.
function scale_coloring(w::StatAccumulator, s::Float64)
function scale_coloring(w::StatAccumulator, s)
throw(ErrorException("StatAccumulator is an abstract type, you probably meant to call a particular instance."))
end

Expand All @@ -72,8 +66,8 @@ end
############################ MinDegStats ################################################

struct MinDegStats <:DegreeStats
min_in::Float64
min_out::Float64
min_in::Float32
min_out::Float32
end

function MinDegStats(g::DataGraph, edges::Vector{Tuple{NodeId, NodeId, Bool}}, color_size::Int)
Expand All @@ -92,28 +86,15 @@ end
# Degree-estimate accessors for MinDegStats: return the stored `min_in` and
# `min_out` fields respectively.
get_in_deg_estimate(d::MinDegStats) = d.min_in
get_out_deg_estimate(d::MinDegStats) = d.min_out


# In case p is negative, we need to reduce the min degree. Otherwise, keep it.
function add_in_deg(d::MinDegStats, p::Float64, c1::Float64, c2::Float64)
return MinDegStats(max(0, min(d.min_in + ceil(p), d.min_in)), d.min_out)
end
function add_out_deg(d::MinDegStats, p::Float64, c1::Float64, c2::Float64)
return MinDegStats(d.min_in, max(0, min(d.min_out + ceil(p), d.min_out)))
end


struct MinAccumulator <:StatAccumulator
weight::Float64
MinAccumulator(c::Union{Float64, Int64}) = new(c)
weight::Float32
end
# `get_count` exposes the accumulator's `weight`.
get_count(w::MinAccumulator) = w.weight
# `sum_colorings` combines two accumulators into a new one whose weight is the
# sum of the two input weights.
sum_colorings(w1::MinAccumulator, w2::MinAccumulator) = MinAccumulator(w1.weight + w2.weight)



# Because the minimum degree estimator aims to produce a lower bound, we generally can't
# scale up weights during sampling and we have to treat cycle closure as 0 probability.
function scale_coloring(w::MinAccumulator, s::Float64)
function scale_coloring(w::MinAccumulator, s)
if s >= 1.0
return w
else
Expand All @@ -133,8 +114,8 @@ end
############################ AvgDegStats ################################################

struct AvgDegStats <:DegreeStats
avg_in::Float64
avg_out::Float64
avg_in::Float32
avg_out::Float32
end

function AvgDegStats(g::DataGraph, edges::Vector{Tuple{NodeId, NodeId, Bool}}, color_size::Int)
Expand All @@ -155,22 +136,14 @@ end
# Degree-estimate accessors for AvgDegStats: return the stored `avg_in` and
# `avg_out` fields respectively.
get_in_deg_estimate(d::AvgDegStats) = d.avg_in
get_out_deg_estimate(d::AvgDegStats) = d.avg_out

function add_in_deg(d::AvgDegStats, p::Float64, c1::Float64, c2::Float64)
return AvgDegStats(max(0, min(d.avg_in + p/c1, c2)), d.avg_out)
end

function add_out_deg(d::AvgDegStats, p::Float64, c1::Float64, c2::Float64)
return AvgDegStats(d.avg_in, max(0, min(d.avg_out + p/c1, c2)))
end

struct AvgAccumulator <:StatAccumulator
weight::Float64
AvgAccumulator(c::Union{Float64, Int64}) = new(c)
weight::Float32
end

# `get_count` exposes the accumulator's `weight`.
get_count(w::AvgAccumulator) = w.weight
# `sum_colorings` combines two accumulators into a new one whose weight is the
# sum of the two input weights.
sum_colorings(w1::AvgAccumulator, w2::AvgAccumulator) = AvgAccumulator(w1.weight + w2.weight)
scale_coloring(w::AvgAccumulator, s::Float64) = AvgAccumulator(w.weight * s)
scale_coloring(w::AvgAccumulator, s) = AvgAccumulator(w.weight * s)

function extend_coloring(w::AvgAccumulator, d::AvgDegStats, out_edge::Bool)
if out_edge
Expand All @@ -183,8 +156,8 @@ end
############################ MaxDegStats ################################################

struct MaxDegStats <:DegreeStats
max_in::Float64
max_out::Float64
max_in::Float32
max_out::Float32
end

function MaxDegStats(g::DataGraph, edges::Vector{Tuple{NodeId, NodeId, Bool}}, color_size::Int)
Expand All @@ -203,25 +176,15 @@ end
# Degree-estimate accessors for MaxDegStats: return the stored `max_in` and
# `max_out` fields respectively.
get_in_deg_estimate(d::MaxDegStats) = d.max_in
get_out_deg_estimate(d::MaxDegStats) = d.max_out

# In case p is negative, we make sure to never reduce the max degree
function add_in_deg(d::MaxDegStats, p::Float64, c1::Float64, c2::Float64)
return MaxDegStats(min(max(d.max_in, d.max_in + ceil(p)), c2), d.max_out)
end

# Return a copy of `d` whose `max_out` has been raised by `ceil(p)` and capped
# at `c2`; `max_in` is carried over unchanged. `c1` is accepted for signature
# parity with the other `add_*_deg` methods and is not used here.
function add_out_deg(d::MaxDegStats, p::Float64, c1::Float64, c2::Float64)
    # Take the max of the old and the incremented value *before* capping, so a
    # negative `p` can never lower the stored maximum. The previous form,
    # `max(d.max_out, d.max_out) + ceil(p)`, applied the increment outside the
    # `max`, so negative `p` reduced the bound.
    raised_out = max(d.max_out, d.max_out + ceil(p))
    return MaxDegStats(d.max_in, min(raised_out, c2))
end

struct MaxAccumulator <:StatAccumulator
weight::Float64
MaxAccumulator(c::Union{Float64, Int64}) = new(c)
weight::Float32
end
# `get_count` exposes the accumulator's `weight`.
get_count(w::MaxAccumulator) = w.weight
# `sum_colorings` combines two accumulators into a new one whose weight is the
# sum of the two input weights.
sum_colorings(w1::MaxAccumulator, w2::MaxAccumulator) = MaxAccumulator(w1.weight + w2.weight)

# Because the max degree estimator aims to produce an upper bound, we have to treat cycle
# closure as 1.0 probability.
function scale_coloring(w::MaxAccumulator, s::Float64)
function scale_coloring(w::MaxAccumulator, s)
if s >= 1.0
return MaxAccumulator(w.weight * s)
else
Expand Down
10 changes: 5 additions & 5 deletions Source/QuasiStableCardinalityEstimator.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
end
deleteat!(current_query_nodes, nodeIdx)
partial_paths = zeros(Color, length(current_query_nodes), length(keys(new_partial_paths)))
partial_weights = fill(W(0.0), length(keys(new_partial_paths)))
partial_weights = fill(W(0), length(keys(new_partial_paths)))

path_idx = 1
for path in keys(new_partial_paths)
Expand All @@ -43,7 +43,7 @@ function sample_paths(partial_paths::Matrix{Color}, partial_weights::Vector{W},
# if we want to sample more paths than there are existing nonzero paths,
# then just return the original partial paths
new_partial_paths = zeros(Color, size(partial_paths))
new_partial_weights = fill(W(0.0), size(partial_weights))
new_partial_weights = fill(W(0), size(partial_weights))
new_path_idx = 1
for i in eachindex(partial_weights)
if get_count(partial_weights[i]) > 0
Expand Down Expand Up @@ -80,7 +80,7 @@ function sample_paths(partial_paths::Matrix{Color}, partial_weights::Vector{W},
sampled_bounds_sum += get_count(new_partial_weights[idx])
end
sampled_partial_paths = zeros(Color, size(new_partial_paths)[1], length(sample_indices))
sampled_partial_weights = fill(W(0.0), length(sample_indices))
sampled_partial_weights = fill(W(0), length(sample_indices))

for i in eachindex(sample_indices)
idx = sample_indices[i]
Expand Down Expand Up @@ -292,7 +292,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary{DS}; ma
# we don't have to keep the label in the partial paths object.
num_colors = summary.num_colors
partial_paths = zeros(Color, 1, num_colors) # each tuple contains a pairing of color paths -> bounds
partial_weights = fill(W(0.0), num_colors)
partial_weights = fill(W(0), num_colors)
visited_query_edges::Vector{Tuple{Int,Int}} = []
current_query_nodes::Vector{Int} = []

Expand Down Expand Up @@ -369,7 +369,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary{DS}; ma
new_data_labels = get_data_label(query, new_node)
num_current_paths = size(partial_paths)[2]
new_partial_paths = zeros(Color, length(current_query_nodes), num_current_paths * num_colors)
new_partial_weights = fill(W(0.0), num_current_paths * num_colors)
new_partial_weights = fill(W(0), num_current_paths * num_colors)
# Update the partial paths using the parent-child combo that comes next from the query.
edge_deg::Dict{Color, Dict{Color, DS}} = Dict()
if haskey(summary.edge_deg, edge_label) &&
Expand Down

0 comments on commit 4593325

Please sign in to comment.