Skip to content

Commit

Permalink
Merge pull request #48 from mkyl/kbd-flexible-deg-stats
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebd99 authored Dec 7, 2023
2 parents 6929150 + e27140d commit f42c40d
Show file tree
Hide file tree
Showing 14 changed files with 493 additions and 329 deletions.
1 change: 1 addition & 0 deletions Experiments/Experiments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ using CSV, DataFrames
using Parquet2: Dataset
using DelimitedFiles: writedlm
using BenchmarkTools
using Random

include("../Source/CardinalityWithColors.jl")
include("utils.jl")
Expand Down
6 changes: 3 additions & 3 deletions Experiments/Scripts/coloring_methods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
using Profile
include("../Experiments.jl")

datasets = [hprd]
datasets = [aids]
partitioning_schemes = [
[(Degree, 64)],
[(NeighborNodeLabels, 64)],
Expand All @@ -11,15 +11,15 @@ partitioning_schemes = [
[(Hash, 64)],
[(Degree, 8), (QuasiStable, 32), (NeighborNodeLabels, 24)],
[(Degree, 8), (NeighborNodeLabels, 24), (QuasiStable, 32)]]

partitioning_schemes = [[(QuasiStable, 64)]]
experiment_params = Vector{ExperimentParams}()
for dataset in datasets
for scheme in partitioning_schemes
push!(experiment_params, ExperimentParams(dataset=dataset, partitioning_scheme=scheme))
end
end

build_experiments(experiment_params)
#build_experiments(experiment_params)

run_estimation_experiments(experiment_params)

Expand Down
4 changes: 2 additions & 2 deletions Experiments/Scripts/memory_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ experiment_params = Vector{ExperimentParams}()
build_params = Vector{ExperimentParams}()
for dataset in datasets
for n in num_colors
push!(build_params, ExperimentParams(dataset=dataset, num_colors=n))
push!(build_params, ExperimentParams(dataset=dataset, partitioning_scheme=[(QuasiStable, n)]))
end
end
build_experiments(build_params)

graph_grouped_bar_plot(build_params; grouping=number_of_colors,
y_type=memory_footprint,
y_lims=[0, 16],
filename="memory_size_vs_colors_fp32_int16")
filename="memory_size_vs_colors_fp32_int16_2")
14 changes: 6 additions & 8 deletions Experiments/Scripts/update_experiments.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
using Plots.PlotMeasures
using Graphs
include("../Experiments.jl")

# datasets::Vector{DATASET} = [wordnet]
datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
datasets::Vector{DATASET} = [wordnet]
# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6
proportions_not_updated = [0, 0.2, 0.4, 0.6, 0.8, 1]
proportions_not_updated = [0.05, 0.2, 0.4, 0.6, 0.8, 1]

experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_cycle, proportion_not_updated=current_proportion)
for current_dataset in datasets for current_cycle in 2:max_cycles for current_proportion in proportions_not_updated]
experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, max_cycle_size=current_cycle, proportion_not_updated=current_proportion)
for current_dataset in datasets for current_cycle in 6:max_cycles for current_proportion in proportions_not_updated]
println("started building")
build_experiments(experiment_params_list)
println("started estimating")
Expand All @@ -18,4 +16,4 @@ println("started graphing")
# compare how overall accuracy is affected by summary updates
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates")
# compare how cycle stat accuracies are affected by summary updates
graph_grouped_box_plot(experiment_params_list, x_type=proportion_not_updated, y_type=estimate_error, grouping=cycle_size, filename="cycle-stats-and-updates")
graph_grouped_box_plot(experiment_params_list, x_type=proportion_not_updated, y_type=estimate_error, grouping=cycle_size, filename="cycle-stats-and-updates")
4 changes: 3 additions & 1 deletion Experiments/graph_results.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated proportion_deleted
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated proportion_deleted deg_stat_type

@enum VALUE estimate_error runtime build_time memory_footprint

Expand Down Expand Up @@ -247,6 +247,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR
return experiment_param.summary_params.proportion_not_updated
elseif value_type == proportion_deleted
return experiment_param.summary_params.proportion_deleted
elseif value_type == deg_stat_type
return experiment_param.summary_params.deg_stats_type
else
# default to grouping by technique
return experiment_param.summary_params.partitioning_scheme
Expand Down
9 changes: 3 additions & 6 deletions Experiments/run_estimators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar
!isfile(summary_file_location) && error("The summary has not been built yet! \n Attempted File Location: $(summary_file_location)")
summary::ColorSummary = deserialize(summary_file_location)
experiment_results = []
push!(experiment_results, ("UpperBound", "Estimate", "LowerBound", "TrueCard", "EstimationTime", "QueryType", "QueryPath"))
push!(experiment_results, ("Estimate", "TrueCard", "EstimationTime", "QueryType", "QueryPath"))
for i in 1:length(all_queries[dataset])
query::QueryGraph = all_queries[dataset][i].query
query_path = all_queries[dataset][i].query_path
Expand All @@ -17,12 +17,9 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar
sampling_strategy=experiment_params.sampling_strategy,
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)) for _ in 1:3]
estimate_time = median([x.time for x in estimate_results]) # Convert back to seconds from nano seconds
bounds = estimate_results[1].value
upper_bound = bounds[3]
estimate = max(1, bounds[2])
lower_bound = bounds[1]
estimate = max(1, estimate_results[1].value)
query_type = all_queries[dataset][i].query_type
push!(experiment_results, (upper_bound, estimate, lower_bound, exact_size, estimate_time, query_type, query_path))
push!(experiment_results, (estimate, exact_size, estimate_time, query_type, query_path))
end
results_file_location = "Experiments/Results/Estimation_" * params_to_results_filename(experiment_params)
writedlm(results_file_location, experiment_results, ",")
Expand Down
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Probably = "2172800d-0309-5a57-a84f-d50c94757422"
QuasiStableColors = "9c3856af-3e7c-4d34-a6af-a406867b22e4"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
Expand Down
47 changes: 33 additions & 14 deletions Source/CardinalityWithColors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,55 +8,74 @@ QSC = QuasiStableColors
using Graphs: SimpleDiGraphFromIterator, Edge, DiGraph, edges, nv, ne, add_edge!,
add_vertex!, vertices, all_neighbors, src, dst, outneighbors, inneighbors


BoolPath = Vector{Bool}
NodeId = Int
Color = Int16
StartEndColorPair = Tuple{Color, Color}
abstract type Comparable end
import Base .==
# Equality for two values of the same `Comparable` subtype: they are equal
# exactly when both their `path` and their `colors` fields compare equal.
function ==(a::T, b::T) where {T<:Comparable}
    return a.path == b.path && a.colors == b.colors
end
@auto_hash_equals mutable struct CyclePathAndColors
path::BoolPath
colors::StartEndColorPair
end

@enum PARTITIONER QuasiStable Hash Degree DirectedDegree SimpleLabel InOut LabelInOut NeighborEdges MostNeighbors NeighborNodeLabels
@enum PARTITIONER QuasiStable Hash Degree NeighborNodeLabels

# Map a partitioner to its short tag:
#   QuasiStable => "QS", Hash => "H", Degree => "D", NeighborNodeLabels => "NNL".
# Any value not listed above yields `nothing`.
function partitioner_to_string(x::PARTITIONER)
    x == QuasiStable && return "QS"
    x == Hash && return "H"
    x == Degree && return "D"
    x == NeighborNodeLabels && return "NNL"
    return nothing
end

PartitioningScheme = Vector{Tuple{PARTITIONER, Int}}

# Compact display of a partitioning scheme: every (partitioner, count) pair is
# rendered as "<tag>:<count>", pairs are separated by ";" and the whole list is
# wrapped in square brackets, e.g. [QS:32;D:8].
# The assembled text is written with `show` on a String, so it appears on `io`
# as a quoted string literal.
function Base.show(io::IO, x::Vector{Tuple{PARTITIONER, Int}})
    entries = [partitioner_to_string(kind) * ":" * string(count) for (kind, count) in x]
    rendered = "[" * join(entries, ";") * "]"
    show(io, rendered)
end

struct ColorSummaryParams
deg_stats_type::Type
num_colors::Int
max_cycle_size::Int
max_partial_paths::Int
partitioning_scheme::Vector{Tuple{PARTITIONER, Int}}
partitioning_scheme::PartitioningScheme
weighting::Bool
proportion_not_updated::Float16
proportion_deleted::Float16

function ColorSummaryParams(;max_cycle_size=4, max_partial_paths=1000,
partitioning_scheme::Vector{Tuple{PARTITIONER, Int}} = [(QuasiStable, 64)], weighting=true, proportion_not_updated = 1.0, proportion_deleted=0.0)
function ColorSummaryParams(;deg_stats_type = AvgDegStats, max_cycle_size=4, max_partial_paths=1000,
partitioning_scheme::PartitioningScheme = [(QuasiStable, 64)], weighting=true, proportion_not_updated = 1.0, proportion_deleted=0.0)
num_colors = sum([x[2] for x in partitioning_scheme])
return new(num_colors, max_cycle_size, max_partial_paths, partitioning_scheme, weighting, proportion_not_updated, proportion_deleted)
return new(deg_stats_type, num_colors, max_cycle_size, max_partial_paths, partitioning_scheme, weighting, proportion_not_updated, proportion_deleted)
end
end

function params_to_string(params::ColorSummaryParams)
summary_name = "ColorSummary_" * string(params.partitioning_scheme) * "_"
summary_name = "ColorSummary_" * string(params.deg_stats_type) * "_"
summary_name *= string(params.partitioning_scheme) * "_"
summary_name *= string(params.max_cycle_size) * "_"
summary_name *= string(params.max_partial_paths)* "_"
summary_name *= string(params.proportion_not_updated) * "_"
summary_name *= string(params.proportion_deleted)
return summary_name
end



include("PropertyGraph.jl")
include("datasets.jl")
include("utils.jl")
include("ExactSizeCalculator.jl")
include("ColoringMethods.jl")
include("DegreeStats.jl")
include("ColorSummary.jl")
include("QuasiStableCardinalityEstimator.jl")
Loading

0 comments on commit f42c40d

Please sign in to comment.