Skip to content

Commit

Permalink
add histogram functions to sqlite
Browse files Browse the repository at this point in the history
log2_hist(input, bucket_count)
mode_hist(mode)
time_hist(timestamp, ref)
category_hist(string)

mode_count(string)
    returns both the mode and the count
    not stable - will only keep first of multiple values with the same count

added C functions to parse the output of these functions
modified trie to hold user data and free it when the trie is freed
added parent's pinode as ppinode to VRPENTRIES
  • Loading branch information
calccrypto committed Jan 19, 2024
1 parent 5f1e426 commit 1efe847
Show file tree
Hide file tree
Showing 15 changed files with 1,507 additions and 82 deletions.
2 changes: 1 addition & 1 deletion contrib/treediff.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ static int get_entries(DIR *dir, trie_t *skip_db,
const size_t len = strlen(entry->d_name);

/* skip . and .. and *.db */
const int skip = (trie_search(skip_db, entry->d_name, len) ||
const int skip = (trie_search(skip_db, entry->d_name, len, NULL) ||
((len >= 3) && (strncmp(entry->d_name + len - 3, ".db", 3) == 0)));
if (skip) {
continue;
Expand Down
220 changes: 220 additions & 0 deletions include/histogram.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
/*
This file is part of GUFI, which is part of MarFS, which is released
under the BSD license.
Copyright (c) 2017, Los Alamos National Security (LANS), LLC
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
From Los Alamos National Security, LLC:
LA-CC-15-039
Copyright (c) 2017, Los Alamos National Security, LLC All rights reserved.
Copyright 2017. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is
modified to produce derivative works, such modified software should be
clearly marked, so as not to confuse it with the version available from
LANL.
THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
*/



#ifndef GUFI_SQLITE3_HISTOGRAM_H
#define GUFI_SQLITE3_HISTOGRAM_H

#include <stddef.h>
#include <time.h>

#include <sqlite3.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Use this to add the histogram functions to a sqlite database handle.
 *
 * db: the sqlite3 handle the functions are added to.
 *
 * NOTE(review): the return convention (which value means success) is
 * not visible in this header - confirm against the definition before
 * combining the result with other status checks.
 */
int addhistfuncs(sqlite3 *db);

/*
* Public API for parsing returned strings.
*
* These structs are intended for external use.
*/

/* ********************************************* */
/*
* log2 Histograms
* log2_hist(input, bucket_count) -> bucket_count;underflow;overflow;bucket1:count1;bucket2:count2;...
*
* Convert an input into a number and take the floor(log2()) of that
* value. Strings and blobs are converted to lengths. Integers and
* floats are not converted.
*
* The histogram contains buckets of the range [0, count). Each bucket
* holds counts for values from [2^i, 2^(i+1)). There are also
* underflow and overflow values for handling 0 values and values that
* are larger than the expected range.
*
* The returned string only contains buckets with counts greater than
* 0. The underflow and overflow counts are always returned even if
* the counts are 0.
*/
/*
 * Parsed form of a log2_hist() result string
 * (bucket_count;underflow;overflow;bucket1:count1;...).
 */
typedef struct log2_hist {
size_t count; /* bucket_count: buckets cover the range [0, count) */
size_t lt; /* underflow: len == 0 */
size_t *buckets; /* integers range [0, count) - use floor(log2(len)) to get bucket to increment */
size_t ge; /* overflow: len >= 2^count */
} log2_hist_t;

/*
 * Parse a string in the log2_hist() output format into a log2_hist_t.
 *
 * NOTE(review): behavior on malformed input (e.g. returning NULL) is
 * not visible in this header - confirm against the definition.
 */
log2_hist_t *log2_hist_parse(const char *str);
/* presumably releases a histogram obtained from log2_hist_parse() - TODO confirm */
void log2_hist_free(log2_hist_t *hist);
/* ********************************************* */

/* ********************************************* */
/*
* Mode (Permission) Histograms
* mode_hist(mode) -> mode1:count1;mode2:count2;...
*
* Buckets are permission bits from 000 - 777.
*
* The returned string only contains permissions with counts greater
* than 0.
*/
/*
 * Parsed form of a mode_hist() result string (mode1:count1;mode2:count2;...).
 */
typedef struct mode_hist {
/* 512 counters == 8^3, one per permission value 000 - 777 (octal);
   presumably indexed by the permission bits - TODO confirm */
size_t buckets[512];
} mode_hist_t;

/*
 * Parse a string in the mode_hist() output format into a mode_hist_t.
 *
 * NOTE(review): behavior on malformed input is not visible in this
 * header - confirm against the definition.
 */
mode_hist_t *mode_hist_parse(const char *str);
/* presumably releases a histogram obtained from mode_hist_parse() - TODO confirm */
void mode_hist_free(mode_hist_t *hist);
/* ********************************************* */

/* ********************************************* */
/*
* Timestamp/Age Histograms
* time_hist(timestamp, ref) -> ref;seconds1:count1;seconds2:count2;...
*
* The buckets where timestamps are counted are predefined.
* The buckets represent [bucket[i - 1], bucket[i]) intervals.
*
* The returned string only contains timestamps with counts greater
* than 0.
*/
/*
 * A single entry of TIME_BUCKETS: a label plus the bucket's
 * boundary expressed in seconds.
 */
typedef struct time_bucket {
    const char name[16];
    time_t seconds;
} time_bucket_t;

/*
 * Predefined time_hist() buckets. Entry i bounds the interval
 * [bucket[i - 1], bucket[i]). Boundaries are spelled out as
 * products so the unit conversions are visible.
 */
static const time_bucket_t TIME_BUCKETS[] = {
    {"second",     1},
    {"minute",     60},
    {"hour",       60 * 60},
    {"day",        24 * 60 * 60},
    {"week",       7 * 24 * 60 * 60},
    {"four_weeks", 28 * 24 * 60 * 60},
    {"year",       365 * 24 * 60 * 60},
    {"years",      0}, /* overflow value - keep last */
};

/* number of entries in TIME_BUCKETS, including the overflow entry */
#define TIME_BUCKETS_COUNT (sizeof(TIME_BUCKETS) / sizeof(*TIME_BUCKETS))

/*
 * Parsed form of a time_hist() result string (ref;seconds1:count1;...).
 */
typedef struct time_hist {
size_t buckets[TIME_BUCKETS_COUNT]; /* one counter per TIME_BUCKETS entry */
time_t ref; /* the leading "ref" field of the result string */
} time_hist_t;

/*
 * Parse a string in the time_hist() output format into a time_hist_t.
 *
 * NOTE(review): behavior on malformed input is not visible in this
 * header - confirm against the definition.
 */
time_hist_t *time_hist_parse(const char *str);
/* presumably releases a histogram obtained from time_hist_parse() - TODO confirm */
void time_hist_free(time_hist_t *hist);
/* ********************************************* */

/* ********************************************* */
/*
* Generic Category Histograms
* category_hist(string) -> category_count;len1:category1:count1;len2:category2:count2;...
*
* Categories are not predefined. Instead, they are generated as
* values are passed into category_hist. If none of the inputs match,
* every single input will be a different category.
*
* The returned string only contains categories with more than 1 count.
*/
/*
 * One "len:category:count" entry of a category_hist() result string.
 */
typedef struct category_bucket {
char *name; /* the category text; NOTE(review): NUL-termination/ownership not visible here - confirm */
size_t len; /* the "len" field; presumably the length of name - TODO confirm */
size_t count; /* the "count" field for this category */
} category_bucket_t;

/*
 * Parsed form of a category_hist() result string
 * (category_count;len1:category1:count1;...).
 */
typedef struct category_hist {
category_bucket_t *buckets; /* the parsed category entries */
size_t count; /* category_count; presumably the number of entries in buckets - TODO confirm */
} category_hist_t;

/*
 * Parse a string in the category_hist() output format into a
 * category_hist_t.
 *
 * NOTE(review): behavior on malformed input is not visible in this
 * header - confirm against the definition.
 */
category_hist_t *category_hist_parse(const char *str);
/* presumably releases a histogram obtained from category_hist_parse() - TODO confirm */
void category_hist_free(category_hist_t *hist);
/* ********************************************* */

/* ********************************************* */
/*
* Statistical Mode
* mode_count(string) -> len:string:count
*
* This is here because mode is implemented using
* category histograms.
*/
/*
 * Parsed form of a mode_count() result string (len:string:count).
 */
typedef struct mode_count {
char *mode; /* the "string" field: the most common value */
size_t len; /* the "len" field; presumably the length of mode - TODO confirm */
size_t count; /* the "count" field */
} mode_count_t;

/*
 * Parse a string in the mode_count() output format into a mode_count_t.
 *
 * NOTE(review): behavior on malformed input is not visible in this
 * header - confirm against the definition.
 */
mode_count_t *mode_count_parse(const char *str);
/* presumably releases a result obtained from mode_count_parse() - TODO confirm */
void mode_count_free(mode_count_t *mc);
/* ********************************************* */

#ifdef __cplusplus
}
#endif

#endif
6 changes: 4 additions & 2 deletions include/trie.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ extern "C" {
typedef struct trie trie_t;

trie_t *trie_alloc(void);
void trie_insert(trie_t *head, const char* str, const size_t len);
int trie_search(trie_t *head, const char* str, const size_t len);
void trie_insert(trie_t *head, const char* str, const size_t len,
void *user_data, void (*free_user)(void *));
int trie_search(trie_t *head, const char* str, const size_t len,
void **user_data);
int trie_delete(trie_t *head, const char* str, const size_t len);
void trie_free(trie_t *head);

Expand Down
2 changes: 1 addition & 1 deletion src/BottomUp.c
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ static int descend_to_bottom(QPTPool_t *ctx, const size_t id, void *data, void *
}

size_t name_len = strlen(entry->d_name);
if (trie_search(ua->skip, entry->d_name, name_len) == 1) {
if (trie_search(ua->skip, entry->d_name, name_len, NULL) == 1) {
continue;
}

Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ set(GUFI_SOURCES
dbutils.c
debug.c
external.c
histogram.c
print.c
template_db.c
trace.c
Expand Down
6 changes: 3 additions & 3 deletions src/bfwreaddirplus2db.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ static int searchmyll(struct PoolArgs *pa, ino_t inode, const enum DFL type) {

char str[256];
const size_t len = SNPRINTF(str, sizeof(str), "%" STAT_ino, inode);
return trie_search(dst, str, len);
return trie_search(dst, str, len, NULL);
}

static int create_readdirplus_tables(const char *name, sqlite3 *db, void *args) {
Expand Down Expand Up @@ -560,7 +560,7 @@ static int processinit(struct PoolArgs *pa, QPTPool_t *ctx) {
if (testll > pa->glsuspectdmax) pa->glsuspectdmax = testll;
}

trie_insert(pa->dirs, incsuspect, strlen(incsuspect));
trie_insert(pa->dirs, incsuspect, strlen(incsuspect), NULL, NULL);
cntd++;
break;
case 'f': case 'l':
Expand All @@ -572,7 +572,7 @@ static int processinit(struct PoolArgs *pa, QPTPool_t *ctx) {
if (testll > pa->glsuspectflmax) pa->glsuspectflmax = testll;
}

trie_insert(pa->fls, incsuspect, strlen(incsuspect));
trie_insert(pa->fls, incsuspect, strlen(incsuspect), NULL, NULL);
cntfl++;
break;
}
Expand Down
44 changes: 23 additions & 21 deletions src/dbutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ OF SUCH DAMAGE.

#include "BottomUp.h"
#include "dbutils.h"
#include "histogram.h"

const char READDIRPLUS_CREATE[] =
DROP_TABLE(READDIRPLUS)
Expand Down Expand Up @@ -113,7 +114,7 @@ const char PENTRIES_CREATE[] =

const char VRPENTRIES_CREATE[] =
DROP_VIEW(VRPENTRIES)
"CREATE VIEW " VRPENTRIES " AS SELECT REPLACE(" SUMMARY ".name, RTRIM(" SUMMARY ".name, REPLACE(" SUMMARY ".name, '/', '')), '') AS dname, " SUMMARY ".name AS sname, " SUMMARY ".mode AS dmode, " SUMMARY ".nlink AS dnlink, " SUMMARY ".uid AS duid, " SUMMARY ".gid AS dgid, " SUMMARY ".size AS dsize, " SUMMARY ".blksize AS dblksize, " SUMMARY ".blocks AS dblocks, " SUMMARY ".atime AS datime, " SUMMARY ".mtime AS dmtime, " SUMMARY ".ctime AS dctime, " SUMMARY ".linkname AS dlinkname, " SUMMARY ".totfiles AS dtotfile, " SUMMARY ".totlinks AS dtotlinks, " SUMMARY ".minuid AS dminuid, " SUMMARY ".maxuid AS dmaxuid, " SUMMARY ".mingid AS dmingid, " SUMMARY ".maxgid AS dmaxgid, " SUMMARY ".minsize AS dminsize, " SUMMARY ".maxsize AS dmaxsize, " SUMMARY ".totzero AS dtotzero, " SUMMARY ".totltk AS dtotltk, " SUMMARY ".totmtk AS dtotmtk, " SUMMARY ".totltm AS totltm, " SUMMARY ".totmtm AS dtotmtm, " SUMMARY ".totmtg AS dtotmtg, " SUMMARY ".totmtt AS dtotmtt, " SUMMARY ".totsize AS dtotsize, " SUMMARY ".minctime AS dminctime, " SUMMARY ".maxctime AS dmaxctime, " SUMMARY ".minmtime AS dminmtime, " SUMMARY ".maxmtime AS dmaxmtime, " SUMMARY ".minatime AS dminatime, " SUMMARY ".maxatime AS dmaxatime, " SUMMARY ".minblocks AS dminblocks, " SUMMARY ".maxblocks AS dmaxblocks, " SUMMARY ".totxattr AS dtotxattr, " SUMMARY ".depth AS ddepth, " SUMMARY ".mincrtime AS dmincrtime, " SUMMARY ".maxcrtime AS dmaxcrtime, " SUMMARY ".rollupscore AS sroll, " SUMMARY ".isroot as atroot, " PENTRIES ".* FROM " SUMMARY ", " PENTRIES " WHERE " SUMMARY ".inode == " PENTRIES ".pinode;";
"CREATE VIEW " VRPENTRIES " AS SELECT REPLACE(" VRSUMMARY ".name, RTRIM(" VRSUMMARY ".name, REPLACE(" VRSUMMARY ".name, '/', '')), '') AS dname, " VRSUMMARY ".name AS sname, " VRSUMMARY ".mode AS dmode, " VRSUMMARY ".nlink AS dnlink, " VRSUMMARY ".uid AS duid, " VRSUMMARY ".gid AS dgid, " VRSUMMARY ".size AS dsize, " VRSUMMARY ".blksize AS dblksize, " VRSUMMARY ".blocks AS dblocks, " VRSUMMARY ".atime AS datime, " VRSUMMARY ".mtime AS dmtime, " VRSUMMARY ".ctime AS dctime, " VRSUMMARY ".linkname AS dlinkname, " VRSUMMARY ".totfiles AS dtotfile, " VRSUMMARY ".totlinks AS dtotlinks, " VRSUMMARY ".minuid AS dminuid, " VRSUMMARY ".maxuid AS dmaxuid, " VRSUMMARY ".mingid AS dmingid, " VRSUMMARY ".maxgid AS dmaxgid, " VRSUMMARY ".minsize AS dminsize, " VRSUMMARY ".maxsize AS dmaxsize, " VRSUMMARY ".totzero AS dtotzero, " VRSUMMARY ".totltk AS dtotltk, " VRSUMMARY ".totmtk AS dtotmtk, " VRSUMMARY ".totltm AS totltm, " VRSUMMARY ".totmtm AS dtotmtm, " VRSUMMARY ".totmtg AS dtotmtg, " VRSUMMARY ".totmtt AS dtotmtt, " VRSUMMARY ".totsize AS dtotsize, " VRSUMMARY ".minctime AS dminctime, " VRSUMMARY ".maxctime AS dmaxctime, " VRSUMMARY ".minmtime AS dminmtime, " VRSUMMARY ".maxmtime AS dmaxmtime, " VRSUMMARY ".minatime AS dminatime, " VRSUMMARY ".maxatime AS dmaxatime, " VRSUMMARY ".minblocks AS dminblocks, " VRSUMMARY ".maxblocks AS dmaxblocks, " VRSUMMARY ".totxattr AS dtotxattr, " VRSUMMARY ".depth AS ddepth, " VRSUMMARY ".mincrtime AS dmincrtime, " VRSUMMARY ".maxcrtime AS dmaxcrtime, " VRSUMMARY ".pinode AS ppinode, " VRSUMMARY ".rollupscore AS sroll, " VRSUMMARY ".isroot as atroot, " VRSUMMARY ".srollsubdirs as srollsubdirs, " PENTRIES ".* FROM " VRSUMMARY ", " PENTRIES " WHERE " VRSUMMARY ".inode == " PENTRIES ".pinode;";

const char TREESUMMARY_EXISTS[] =
"SELECT name FROM sqlite_master WHERE (type == 'table') AND (name == '" TREESUMMARY "');";
Expand Down Expand Up @@ -1210,26 +1211,27 @@ static void median_final(sqlite3_context *context) {

int addqueryfuncs(sqlite3 *db) {
return !(
(sqlite3_create_function(db, "uidtouser", 1, SQLITE_UTF8,
NULL, &uidtouser, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "gidtogroup", 1, SQLITE_UTF8,
NULL, &gidtogroup, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "modetotxt", 1, SQLITE_UTF8,
NULL, &modetotxt, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "strftime", 2, SQLITE_UTF8,
NULL, &sqlite3_strftime, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "blocksize", 2, SQLITE_UTF8,
NULL, &blocksize, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "human_readable_size", 1, SQLITE_UTF8,
NULL, &human_readable_size, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "basename", 1, SQLITE_UTF8,
NULL, &sqlite_basename, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "stdevs", 1, SQLITE_UTF8,
NULL, NULL, stdev_step, stdevs_final) == SQLITE_OK) &&
(sqlite3_create_function(db, "stdevp", 1, SQLITE_UTF8,
NULL, NULL, stdev_step, stdevp_final) == SQLITE_OK) &&
(sqlite3_create_function(db, "median", 1, SQLITE_UTF8,
NULL, NULL, median_step, median_final) == SQLITE_OK)
(sqlite3_create_function(db, "uidtouser", 1, SQLITE_UTF8,
NULL, &uidtouser, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "gidtogroup", 1, SQLITE_UTF8,
NULL, &gidtogroup, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "modetotxt", 1, SQLITE_UTF8,
NULL, &modetotxt, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "strftime", 2, SQLITE_UTF8,
NULL, &sqlite3_strftime, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "blocksize", 2, SQLITE_UTF8,
NULL, &blocksize, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "human_readable_size", 1, SQLITE_UTF8,
NULL, &human_readable_size, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "basename", 1, SQLITE_UTF8,
NULL, &sqlite_basename, NULL, NULL) == SQLITE_OK) &&
(sqlite3_create_function(db, "stdevs", 1, SQLITE_UTF8,
NULL, NULL, stdev_step, stdevs_final) == SQLITE_OK) &&
(sqlite3_create_function(db, "stdevp", 1, SQLITE_UTF8,
NULL, NULL, stdev_step, stdevp_final) == SQLITE_OK) &&
(sqlite3_create_function(db, "median", 1, SQLITE_UTF8,
NULL, NULL, median_step, median_final) == SQLITE_OK) &&
addhistfuncs(db)
);
}

Expand Down
2 changes: 1 addition & 1 deletion src/gufi_query/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ static size_t descend2(QPTPool_t *ctx,

descend_timestamp_start(dts, strncmp_call);
size_t len = strlen(entry->d_name);
const int skip = (trie_search(skip_names, entry->d_name, len) ||
const int skip = (trie_search(skip_names, entry->d_name, len, NULL) ||
(strncmp(entry->d_name + len - 3, ".db", 3) == 0));
descend_timestamp_end(strncmp_call);

Expand Down
Loading

0 comments on commit 1efe847

Please sign in to comment.