In order to scan for PII, we first need to enumerate all the files in the system. This is the process of finding all the files on the system that we want to scan.
This is a non-trivial problem because:
PII Crawler approaches this problem by:
We also want to handle archive/container files like .zip, .tar.gz, etc. We don’t want to unzip them, but we do want to record the files within them. PII Crawler treats archives as a special kind of directory and enumerates the files within them.
The recorded path for a file inside a .zip archive will be something like:
/path/to/archive.zip::filename.ext
Below is the (Go) code for how we enumerate and save file paths for PII scanning. On our hardware this function can enumerate 1 million files in 2-3 seconds.
// enumerateFiles walks the directory hierarchy recursively starting at rootPath
// and records every regular file in the files table, using one transaction for
// all inserts. An empty rootPath means the current directory.
//
// Files whose extension is in archiveExtensions are not recorded themselves;
// instead each entry reported by ziputil.FileEnumerate is recorded as
// "<archive path>::<inner path>" with parent_path set to the archive.
//
// Entries that cannot be read, resolved or stat'ed are logged and skipped.
// A failed insert of a regular file stops the walk early; whatever was
// recorded up to that point is still committed.
//
// It returns the number of regular files visited on disk (an archive counts
// once; its contents are not counted). A non-nil error is returned only when
// the transaction cannot be started or committed.
func enumerateFiles(rootPath string) (int, error) {
	// if rootPath is not specified, use current directory
	if rootPath == "" {
		slog.Warn("scan path not specified, using current directory")
		rootPath = "."
	}
	fmt.Println("Enumerating files at path:", rootPath)

	// create transaction to speed up file path DB inserts
	tx, err := db.BeginTx(context.Background(), nil)
	if err != nil {
		return 0, fmt.Errorf("beginning file enumeration transaction: %w", err)
	}
	// Safety net for panics and early exits. Once Commit has run, Rollback
	// only reports that the transaction is already done, so the result is
	// deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	var totalFilesEnumerated int
	walkErr := filepath.WalkDir(rootPath, func(path string, di fs.DirEntry, entryErr error) error {
		if entryErr != nil {
			// The root could not be stat'ed or a directory could not be read.
			// Report it and keep walking whatever is still reachable.
			log.Printf("couldn't access %q: %s", path, entryErr)
			return nil
		}
		if di == nil || di.IsDir() {
			// we only want to record files
			return nil
		}

		absPath, absErr := filepath.Abs(path)
		if absErr != nil {
			// Without an absolute path there is nothing meaningful to record.
			log.Printf("couldn't resolve absolute path for %q: %s", path, absErr)
			return nil
		}

		fInfo, infoErr := di.Info()
		if infoErr != nil {
			// The file may have been removed or renamed since the directory
			// was read; fInfo is unusable, so skip the entry.
			log.Printf("couldn't stat %q: %s", absPath, infoErr)
			return nil
		}
		if !fInfo.Mode().IsRegular() {
			// we don't want named pipes and other non regular files
			return nil
		}
		totalFilesEnumerated++

		// get file extension and normalize it
		extension := strings.ToLower(filepath.Ext(path))

		// mark file as skip if it matches any ignore patterns
		var skip uint
		baseName := filepath.Base(absPath)
		for _, ignorePattern := range baseFilenamePrefixIgnore.List() {
			if strings.HasPrefix(baseName, ignorePattern) {
				skip = 1 // sqlite uses 0/1 for true/false
				break
			}
		}

		// handle archive type files
		if archiveExtensions.Has(extension) {
			// file is an archive type and has files within
			contents, archiveErr := ziputil.FileEnumerate(absPath)
			if archiveErr != nil {
				log.Printf("couldn't open ZIP file %q: %s", absPath, archiveErr.Error())
				return nil
			}
			for insidePath, insideSize := range contents {
				fullPath := fmt.Sprintf("%s::%s", absPath, insidePath)
				insideExtension := strings.ToLower(filepath.Ext(insidePath))
				// record path and file size; a failed insert only loses this entry
				_, insertErr := tx.Exec(`insert into files (path, size, extension, skip, parent_path)
values (?,?,?,?,?)
on conflict do nothing`, fullPath, insideSize, insideExtension, 0, absPath) // TODO SKIP
				if insertErr != nil {
					log.Println(insertErr)
				}
			}
			return nil // don't record below to avoid putting the parent .zip file in the db
		}

		// record path and file size
		_, insertErr := tx.Exec(`insert into files (path, size, extension, skip)
values (?,?,?,?)
on conflict do nothing`, absPath, fInfo.Size(), extension, skip)
		if insertErr != nil {
			// Stop the walk; the error is logged once, after WalkDir returns.
			return fmt.Errorf("recording %q: %w", absPath, insertErr)
		}
		return nil
	})
	if walkErr != nil {
		// Best effort: keep what was recorded before the walk stopped.
		log.Println(walkErr)
	}

	if err := tx.Commit(); err != nil {
		return 0, fmt.Errorf("committing enumerated files: %w", err)
	}
	return totalFilesEnumerated, nil
}
Looking for Personally Identifiable Information (PII)?
Download PII Crawler and find PII today!
💌 Get notified on new features and updates