-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdsprofiling.R
71 lines (64 loc) · 2.13 KB
/
dsprofiling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#' # DsProfiling - Dataset Profiling
#' @author Gürol Canbek, <[email protected]>
#' Copyright (C) 2017-2018 Gürol CANBEK
#' This file is licensed under
#'
#' A p a c h e L i c e n s e 2 . 0
#'
#' A permissive license whose main conditions require preservation of copyright
#' and license notices. Contributors provide an express grant of patent rights.
#' Licensed works, modifications, and larger works may be distributed under
#' different terms and without source code.
#'
#' See the license file in <https://github.com/gurol/dsprofiling>
#' @references <http://gurol.canbek.com>
#' @keywords dataset, profiling, data quality, quantitative analysis, benchmark
#' @title Dataset Profiling
#' @version 1.0
#' @description R functions for calculating some of the profiling criteria for
#' the datasets
#' @note version history
#' 1.0, 17 June 2017, The first version
#' @date 17 June 2017
#' libraries
# None
# CSV datasets should be converted to columnar format
# dfPermVT2018 <- convertCsvToDataFrame(dfPermCsvVT2018, sep=',', filter=dfStandard)
#' ### replaceNotNaValues
#' Overwrite every value that is *not* NA/NaN with the given value; missing
#' (NA/NaN) entries are left untouched
#' **Parameters:**
#' *df*: Input dataset
#' *new_value*: Value written over each non-missing entry
#' **Return:**
#' The dataset with its non-missing entries replaced by *new_value*
#' **Example Usage:**
# dfPermAMD <- replaceNotNaValues(dfPermAMD, TRUE)
replaceNotNaValues <- function(df, new_value) {
  # Logical mask marking the cells that currently hold a value
  has_value <- !is.na(df)
  df[has_value] <- new_value
  df
}
#' ### profileDatasetDensity
#' Density profiling criterion: the share of cells in a dataset that hold a
#' value (i.e. are not NA)
#' **Parameters:**
#' *df*: Input dataset
#' **Return:**
#' Number of non-NA cells divided by the total number of cells
#' (rows x columns)
#' **Example Usage:**
# densityAMD <- profileDatasetDensity(dfPermAMD)
profileDatasetDensity <- function(df) {
  # Each TRUE in the mask is one filled cell, so summing the mask counts them
  filled_cells <- sum(!is.na(df))
  total_cells <- nrow(df) * ncol(df)
  filled_cells / total_cells
}
#' ### profileDatasetSparsity
#' Sparsity profiling criterion: the complement of the dataset density
#' **Parameters:**
#' *df*: Input dataset
#' **Return:**
#' One minus the value returned by profileDatasetDensity for *df*
#' **Example Usage:**
# sparsityAMD <- profileDatasetSparsity(dfPermAMD)
profileDatasetSparsity <- function(df) {
  density <- profileDatasetDensity(df)
  1 - density
}