Skip to content

Commit

Permalink
update min/max level to be closer to how find(1) works
Browse files Browse the repository at this point in the history
level comparisons are now min <= level <= max
however, GUFI files/links are stored at the same level as their parent directory, so they will appear in GUFI results even when they would not appear in the equivalent find(1) results
    - cannot put files/links into directories that were not indexed because the db.db file will not exist
        - if the db.db file were created anyway, the summary table would have to be cleared, which would make db.db invalid

added gufi_dir2index -y
updated tests
  • Loading branch information
calccrypto committed Feb 27, 2025
1 parent 660cdc9 commit 846a4fb
Show file tree
Hide file tree
Showing 15 changed files with 458 additions and 294 deletions.
13 changes: 7 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,13 @@ endfunction()
find_exec_dependency(TRUNCATE truncate)

# not from coreutils, but still very common
find_exec_dependency(AWK awk gawk)
find_exec_dependency(DIFF colordiff diff)
find_exec_dependency(FIND find)
find_exec_dependency(GREP grep)
find_exec_dependency(PATCH patch)
find_exec_dependency(SED sed)
find_exec_dependency(AWK awk gawk)
find_exec_dependency(COLUMN column)
find_exec_dependency(DIFF colordiff diff)
find_exec_dependency(FIND find)
find_exec_dependency(GREP grep)
find_exec_dependency(PATCH patch)
find_exec_dependency(SED sed)
if(APPLE)
find_exec_dependency(XATTR xattr)
else()
Expand Down
1 change: 1 addition & 0 deletions contrib/CI/ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ apt -y install \
apt -y install \
attr \
autoconf \
bsdmainutils \
clang \
cmake \
git \
Expand Down
88 changes: 52 additions & 36 deletions src/descend.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,11 @@ int descend(QPTPool_t *ctx, const size_t id, void *args,
struct descend_counters ctrs;
memset(&ctrs, 0, sizeof(ctrs));

if (work->level < in->max_level) {
/*
* check current level because files/links are
* in the same level as the directory
*/
if (work->level <= in->max_level) {
/* calculate once */
const size_t next_level = work->level + 1;
const size_t recursion_level = work->recursion_level + 1;
Expand Down Expand Up @@ -159,35 +163,44 @@ int descend(QPTPool_t *ctx, const size_t id, void *args,

/* push subdirectories onto the queue */
if (S_ISDIR(child_ed.statuso.st_mode)) {
child_ed.type = 'd';
/*
* check the next level because the subdirectory is
* NOT in the same level as the parent directory
*/
if (next_level <= in->max_level) {
child_ed.type = 'd';

if (!in->subdir_limit || (ctrs.dirs < in->subdir_limit)) {
struct work *copy = compress_struct(in->compress, child, struct_work_size(child));
QPTPool_enqueue_swappable(ctx, id, processdir, copy,
work_serialize_and_free, QPTPool_generic_alloc_and_deserialize);
}
else {
/*
* If this directory has too many subdirectories,
* process the current subdirectory here instead
* of enqueuing it. This only allows for one
* subdirectory work item to be allocated at a
* time instead of all of them, reducing overall
* memory usage. This branch is only applied at
* this level, so small subdirectories will still
* enqueue work, and large subdirectories will
* still enqueue some work and process the
* remaining in-situ.
*
* Return value should probably be used.
*/
child->recursion_level = recursion_level;
processdir(ctx, id, child, args);
ctrs.dirs_insitu++;
}

if (!in->subdir_limit || (ctrs.dirs < in->subdir_limit)) {
struct work *copy = compress_struct(in->compress, child, struct_work_size(child));
QPTPool_enqueue_swappable(ctx, id, processdir, copy,
work_serialize_and_free, QPTPool_generic_alloc_and_deserialize);
ctrs.dirs++;
}
else {
/*
* If this directory has too many subdirectories,
* process the current subdirectory here instead
* of enqueuing it. This only allows for one
* subdirectory work item to be allocated at a
* time instead of all of them, reducing overall
* memory usage. This branch is only applied at
* this level, so small subdirectories will still
* enqueue work, and large subdirectories will
* still enqueue some work and process the
* remaining in-situ.
*
* Return value should probably be used.
*/
child->recursion_level = recursion_level;
processdir(ctx, id, child, args);
ctrs.dirs_insitu++;
/* skip enqueuing and just free */
free(child);
}

ctrs.dirs++;

continue;
}
/* non directories */
Expand All @@ -208,18 +221,21 @@ int descend(QPTPool_t *ctx, const size_t id, void *args,

ctrs.nondirs++;

if (processnondir) {
if (in->process_xattrs) {
xattrs_setup(&child_ed.xattrs);
xattrs_get(child->name, &child_ed.xattrs);
}
/* if this directory was processed, process the files/links */
if (in->min_level <= work->level) {
if (processnondir) {
if (in->process_xattrs) {
xattrs_setup(&child_ed.xattrs);
xattrs_get(child->name, &child_ed.xattrs);
}

child_ed.parent_fd = d_fd;
processnondir(child, &child_ed, nondir_args);
ctrs.nondirs_processed++;
child_ed.parent_fd = d_fd;
processnondir(child, &child_ed, nondir_args);
ctrs.nondirs_processed++;

if (in->process_xattrs) {
xattrs_cleanup(&child_ed.xattrs);
if (in->process_xattrs) {
xattrs_cleanup(&child_ed.xattrs);
}
}
}

Expand Down
137 changes: 76 additions & 61 deletions src/gufi_dir2index.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ struct PoolArgs {
struct template_db db;
struct template_db xattr;

uint64_t *total_files;
uint64_t *total_dirs;
uint64_t *total_nondirs;
};

struct NonDirArgs {
Expand Down Expand Up @@ -192,6 +193,9 @@ static int processdir(QPTPool_t *ctx, const size_t id, void *data, void *args) {

decompress_work(&nda.work, data);

const int process_dbdb = ((pa->in.min_level <= nda.work->level) &&
(nda.work->level <= pa->in.max_level));

if (lstat(nda.work->name, &nda.ed.statuso) != 0) {
const int err = errno;
fprintf(stderr, "Error: Could not stat directory \"%s\": %s (%d)\n", nda.work->name, strerror(err), err);
Expand Down Expand Up @@ -234,71 +238,77 @@ static int processdir(QPTPool_t *ctx, const size_t id, void *data, void *args) {
}
}

/* restore "/db.db" */
nda.topath[nda.topath_len] = '/';
if (process_dbdb) {
/* restore "/db.db" */
nda.topath[nda.topath_len] = '/';

nda.db = template_to_db(nda.temp_db, nda.topath, nda.ed.statuso.st_uid, nda.ed.statuso.st_gid);
nda.db = template_to_db(nda.temp_db, nda.topath, nda.ed.statuso.st_uid, nda.ed.statuso.st_gid);

/* remove "/db.db" */
nda.topath[nda.topath_len] = '\0';
/* remove "/db.db" */
nda.topath[nda.topath_len] = '\0';

if (!nda.db) {
rc = 1;
goto cleanup;
}
if (!nda.db) {
rc = 1;
goto cleanup;
}

/* prepare to insert into the database */
zeroit(&nda.summary);
/* prepare to insert into the database */
zeroit(&nda.summary);

/* prepared statements within db.db */
nda.entries_res = insertdbprep(nda.db, ENTRIES_INSERT);
nda.xattrs_res = NULL;
nda.xattr_files_res = NULL;
/* prepared statements within db.db */
nda.entries_res = insertdbprep(nda.db, ENTRIES_INSERT);
nda.xattrs_res = NULL;
nda.xattr_files_res = NULL;

if (nda.in->process_xattrs) {
nda.xattrs_res = insertdbprep(nda.db, XATTRS_PWD_INSERT);
nda.xattr_files_res = insertdbprep(nda.db, EXTERNAL_DBS_PWD_INSERT);
if (nda.in->process_xattrs) {
nda.xattrs_res = insertdbprep(nda.db, XATTRS_PWD_INSERT);
nda.xattr_files_res = insertdbprep(nda.db, EXTERNAL_DBS_PWD_INSERT);

/* external per-user and per-group dbs */
sll_init(&nda.xattr_db_list);
/* external per-user and per-group dbs */
sll_init(&nda.xattr_db_list);
}

startdb(nda.db);
}

struct descend_counters ctrs;
startdb(nda.db);
descend(ctx, id, pa, nda.in, nda.work, nda.ed.statuso.st_ino, dir, 0,
processdir, process_nondir, &nda,
&ctrs);
stopdb(nda.db);

/* entries and xattrs have been inserted */
if (process_dbdb) {
stopdb(nda.db);

if (nda.in->process_xattrs) {
/* write out per-user and per-group xattrs */
sll_destroy(&nda.xattr_db_list, destroy_xattr_db);
/* entries and xattrs have been inserted */

/* keep track of per-user and per-group xattr dbs */
insertdbfin(nda.xattr_files_res);
if (nda.in->process_xattrs) {
/* write out per-user and per-group xattrs */
sll_destroy(&nda.xattr_db_list, destroy_xattr_db);

/* pull this directory's xattrs because they were not pulled by the parent */
xattrs_setup(&nda.ed.xattrs);
xattrs_get(nda.work->name, &nda.ed.xattrs);
/* keep track of per-user and per-group xattr dbs */
insertdbfin(nda.xattr_files_res);

/* directory xattrs go into the same table as entries xattrs */
insertdbgo_xattrs_avail(&nda.ed, nda.xattrs_res);
insertdbfin(nda.xattrs_res);
}
insertdbfin(nda.entries_res);

/* insert this directory's summary data */
/* the xattrs go into the xattrs_avail table in db.db */
insertsumdb(nda.db, nda.work->name + nda.work->name_len - nda.work->basename_len,
nda.work, &nda.ed, &nda.summary);
if (nda.in->process_xattrs) {
xattrs_cleanup(&nda.ed.xattrs);
}
/* pull this directory's xattrs because they were not pulled by the parent */
xattrs_setup(&nda.ed.xattrs);
xattrs_get(nda.work->name, &nda.ed.xattrs);

closedb(nda.db);
nda.db = NULL;
/* directory xattrs go into the same table as entries xattrs */
insertdbgo_xattrs_avail(&nda.ed, nda.xattrs_res);
insertdbfin(nda.xattrs_res);
}
insertdbfin(nda.entries_res);

/* insert this directory's summary data */
/* the xattrs go into the xattrs_avail table in db.db */
insertsumdb(nda.db, nda.work->name + nda.work->name_len - nda.work->basename_len,
nda.work, &nda.ed, &nda.summary);
if (nda.in->process_xattrs) {
xattrs_cleanup(&nda.ed.xattrs);
}

closedb(nda.db);
nda.db = NULL;
}

/* ignore errors */
chmod(nda.topath, nda.ed.statuso.st_mode);
Expand All @@ -307,11 +317,14 @@ static int processdir(QPTPool_t *ctx, const size_t id, void *data, void *args) {
cleanup:
closedir(dir);

if (process_dbdb) {
pa->total_dirs[id]++;
pa->total_nondirs[id] += ctrs.nondirs_processed;
}

free(nda.topath);
free(nda.work);

pa->total_files[id] += ctrs.nondirs_processed;

return rc;
}

Expand Down Expand Up @@ -367,6 +380,7 @@ static int validate_source(struct input *in, const char *path, struct work **wor

new_work->root_parent.data = path;
new_work->root_parent.len = dirname_len(path, new_work->name_len);
new_work->level = 0;

char expathin[MAXPATH];
char expathout[MAXPATH];
Expand All @@ -393,7 +407,7 @@ static void sub_help(void) {

int main(int argc, char *argv[]) {
struct PoolArgs pa;
process_args_and_maybe_exit("hHvn:xz:k:M:s:C:" COMPRESS_OPT "q", 2, "input_dir... output_dir", &pa.in);
process_args_and_maybe_exit("hHvn:xy:z:k:M:s:C:" COMPRESS_OPT "q", 2, "input_dir... output_dir", &pa.in);

/* parse positional args, following the options */
/* does not have to be canonicalized */
Expand Down Expand Up @@ -441,7 +455,8 @@ int main(int argc, char *argv[]) {

fprintf(stdout, "Creating GUFI Index %s with %zu threads\n", pa.in.nameto.data, pa.in.maxthreads);

pa.total_files = calloc(pa.in.maxthreads, sizeof(uint64_t));
pa.total_dirs = calloc(pa.in.maxthreads, sizeof(uint64_t));
pa.total_nondirs = calloc(pa.in.maxthreads, sizeof(uint64_t));

struct start_end after_init;
clock_gettime(CLOCK_MONOTONIC, &after_init.start);
Expand Down Expand Up @@ -474,35 +489,35 @@ int main(int argc, char *argv[]) {
QPTPool_enqueue(pool, 0, processdir, copy);
i++;
}

QPTPool_stop(pool);

clock_gettime(CLOCK_MONOTONIC, &after_init.end);
const long double processtime = sec(nsec(&after_init));

/* don't count as part of processtime */

const uint64_t thread_count = QPTPool_threads_completed(pool);

QPTPool_destroy(pool);

for(size_t i = 0; i < root_count; i++) {
free(roots[i]);
}
free(roots);

uint64_t total_files = 0;
uint64_t total_dirs = 0;
uint64_t total_nondirs = 0;
for(size_t i = 0; i < pa.in.maxthreads; i++) {
total_files += pa.total_files[i];
total_dirs += pa.total_dirs[i];
total_nondirs += pa.total_nondirs[i];
}

free(pa.total_files);
free(pa.total_dirs);
free(pa.total_nondirs);

fprintf(stdout, "Total Dirs: %" PRIu64 "\n", thread_count);
fprintf(stdout, "Total Files: %" PRIu64 "\n", total_files);
fprintf(stdout, "Total Dirs: %" PRIu64 "\n", total_dirs);
fprintf(stdout, "Total Non-Dirs: %" PRIu64 "\n", total_nondirs);
fprintf(stdout, "Time Spent Indexing: %.2Lfs\n", processtime);
fprintf(stdout, "Dirs/Sec: %.2Lf\n", thread_count / processtime);
fprintf(stdout, "Files/Sec: %.2Lf\n", total_files / processtime);
fprintf(stdout, "Dirs/Sec: %.2Lf\n", total_dirs / processtime);
fprintf(stdout, "Non-Dirs/Sec: %.2Lf\n", total_nondirs / processtime);

free_xattr:
close_template_db(&pa.xattr);
Expand Down
3 changes: 1 addition & 2 deletions src/gufi_query/process_queries.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,8 @@ static size_t descend2(QPTPool_t *ctx,

size_t pushed = 0;
const size_t next_level = gqw->work.level + 1;
const int level_check = (next_level < max_level);

if (level_check) {
if (next_level <= max_level) {
/* Send subdirs to queue */
/* loop over dirents */
/* skip db.db and any filename listed in the trie struct */
Expand Down
2 changes: 1 addition & 1 deletion test/regression/compression.expected
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
$ "gufi_dir2index" -x -e "prefix" "search.d2i"
Creating GUFI Index search.d2i with 1 threads
Total Dirs: 6
Total Files: 14
Total Non-Dirs: 14

$ "gufi_dir2trace" -x -e "prefix" "trace"
Creating GUFI Traces trace with 1 threads
Expand Down
Loading

0 comments on commit 846a4fb

Please sign in to comment.