2023-12-03 02:08:29 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
2023-12-04 21:28:04 +00:00
|
|
|
"context"
|
2023-12-03 02:08:29 +00:00
|
|
|
"encoding/json"
|
|
|
|
"flag"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"io/fs"
|
|
|
|
"os"
|
|
|
|
"os/signal"
|
|
|
|
"path/filepath"
|
2023-12-04 21:28:04 +00:00
|
|
|
"runtime"
|
|
|
|
"sync"
|
2023-12-03 02:08:29 +00:00
|
|
|
"sync/atomic"
|
|
|
|
"syscall"
|
2023-12-04 21:28:04 +00:00
|
|
|
|
|
|
|
"github.com/sourcegraph/conc/pool"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Command-line flags. They are registered at package initialisation and hold
// the user's values only after main has called flag.Parse.
var (
	// sourceDir is the root of the directory tree that main walks.
	sourceDir = flag.String("source", ".", "source data directory")

	// tempDir is the directory in which work creates its per-file temporary
	// directories.
	tempDir = flag.String("temp", "/tmp", "temporary storage directory")

	// dbPath is the JSON database file; an empty value disables loading and
	// saving.
	dbPath = flag.String("db", "datashake.json", "database file path")

	// minimumSize is the size in bytes below which work skips a file.
	minimumSize = flag.Int64("min-size", 1024*1024, "minimum size in bytes")

	// concurrency caps the number of files processed at once. main replaces
	// any value below 1 with runtime.GOMAXPROCS(0), so the usage text says so.
	concurrency = flag.Int("concurrency", 1, "concurrent processing limit (values below 1 use GOMAXPROCS)")
)
|
|
|
|
|
|
|
|
func main() {
|
2023-12-04 21:28:04 +00:00
|
|
|
ctx, stop := signal.NotifyContext(
|
|
|
|
context.Background(),
|
|
|
|
syscall.SIGINT, syscall.SIGTERM,
|
|
|
|
)
|
|
|
|
defer stop()
|
|
|
|
|
2023-12-03 02:08:29 +00:00
|
|
|
flag.Parse()
|
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
concurrency := *concurrency
|
|
|
|
if concurrency < 1 {
|
|
|
|
concurrency = runtime.GOMAXPROCS(0)
|
|
|
|
}
|
|
|
|
tasks = pool.New().WithMaxGoroutines(concurrency)
|
|
|
|
|
2023-12-03 02:08:29 +00:00
|
|
|
if err := loadDb(); err != nil {
|
|
|
|
fmt.Println("error", err)
|
|
|
|
os.Exit(1)
|
|
|
|
}
|
|
|
|
|
|
|
|
running.Store(true)
|
|
|
|
go func() {
|
|
|
|
if err := filepath.WalkDir(*sourceDir, process); err != nil {
|
|
|
|
errors <- err
|
|
|
|
}
|
2023-12-04 21:28:04 +00:00
|
|
|
pending.Wait()
|
2023-12-03 02:08:29 +00:00
|
|
|
close(errors)
|
|
|
|
}()
|
|
|
|
|
|
|
|
Loop:
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case err, ok := <-errors:
|
|
|
|
if ok {
|
|
|
|
fmt.Println("error:", err)
|
|
|
|
} else {
|
|
|
|
break Loop
|
|
|
|
}
|
2023-12-04 21:28:04 +00:00
|
|
|
case <-ctx.Done():
|
2023-12-03 02:08:29 +00:00
|
|
|
running.Store(false)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := saveDb(); err != nil {
|
|
|
|
fmt.Println("error", err)
|
|
|
|
os.Exit(1)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
var (
|
|
|
|
running atomic.Bool
|
2023-12-03 02:08:29 +00:00
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
tasks *pool.Pool
|
|
|
|
pending sync.WaitGroup
|
|
|
|
errors = make(chan error)
|
2023-12-03 02:08:29 +00:00
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
db = DB{
|
|
|
|
Processed: make(map[string]struct{}),
|
2023-12-03 02:08:29 +00:00
|
|
|
}
|
2023-12-04 21:28:04 +00:00
|
|
|
dbLock sync.Mutex
|
|
|
|
)
|
2023-12-03 02:08:29 +00:00
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
// process is a visitor for `filepath.WalkDir` that performs the rebalancing
|
|
|
|
// algorithm against regular files.
|
2023-12-03 02:08:29 +00:00
|
|
|
//
|
2023-12-04 21:28:04 +00:00
|
|
|
// This function normally never returns an error, since that would stop the
|
2023-12-03 02:08:29 +00:00
|
|
|
// directory walk. Instead, any errors are sent to the `errors` channel.
|
2023-12-04 21:28:04 +00:00
|
|
|
func process(path string, d fs.DirEntry, err error) (typicallyNil error) {
|
2023-12-03 02:08:29 +00:00
|
|
|
if !running.Load() {
|
|
|
|
return fs.SkipAll
|
|
|
|
}
|
2023-12-04 21:28:04 +00:00
|
|
|
if err != nil || d.IsDir() || !d.Type().IsRegular() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
pending.Add(1)
|
|
|
|
tasks.Go(func() {
|
|
|
|
defer pending.Done()
|
|
|
|
if running.Load() {
|
|
|
|
work(path, d)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
return
|
|
|
|
}
|
2023-12-03 02:08:29 +00:00
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
// work rebalances a single file.
|
|
|
|
func work(path string, d fs.DirEntry) {
|
|
|
|
var err error
|
2023-12-03 02:08:29 +00:00
|
|
|
defer func() {
|
|
|
|
if err != nil {
|
|
|
|
errors <- err
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
srcFileName := d.Name()
|
|
|
|
srcFilePath, err := filepath.Abs(path)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if db.Contains(srcFilePath) {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
srcStat, err := os.Stat(srcFilePath)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if srcStat.Size() < *minimumSize {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
tempDirPath, err := os.MkdirTemp(*tempDir, "*")
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
tempFilePath := filepath.Join(tempDirPath, srcFileName)
|
|
|
|
safeToRemoveTemp := true
|
|
|
|
defer func() {
|
|
|
|
if !safeToRemoveTemp {
|
|
|
|
err := fmt.Errorf(
|
|
|
|
"%s may be lost in %s",
|
|
|
|
srcFilePath, tempDirPath,
|
|
|
|
)
|
|
|
|
errors <- err
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if err := os.RemoveAll(tempDirPath); err != nil {
|
|
|
|
errors <- err
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
err = copy(srcFilePath, tempFilePath)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
safeToRemoveTemp = false
|
|
|
|
err = os.Remove(srcFilePath)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
err = copy(tempFilePath, srcFilePath)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
safeToRemoveTemp = true
|
|
|
|
db.Remember(srcFilePath)
|
|
|
|
}
|
|
|
|
|
|
|
|
// copy opens the file at srcPath and creates (or truncates) dstPath with the
// same contents. The mode bits and, where the platform exposes them through
// syscall.Stat_t, the uid and gid of the source file are applied to the
// destination before the data is written. A line announcing the copy is
// printed to standard output.
//
// It returns the first error encountered. That includes a failure to close
// the destination file: callers delete the original after a successful copy,
// so a write error that only surfaces on Close must not be lost.
func copy(srcPath, dstPath string) (err error) {
	fmt.Println("copying", srcPath, "to", dstPath)

	srcFile, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	defer func() {
		// The source is only read from, so a Close failure loses nothing.
		_ = srcFile.Close()
	}()

	dstFile, err := os.Create(dstPath)
	if err != nil {
		return err
	}
	defer func() {
		// Keep an earlier error if there is one; otherwise report Close's.
		if closeErr := dstFile.Close(); err == nil {
			err = closeErr
		}
	}()

	// Operate on the open handles rather than the paths so that the metadata
	// is read from, and applied to, exactly the files being copied.
	srcStat, err := srcFile.Stat()
	if err != nil {
		return err
	}
	if err = dstFile.Chmod(srcStat.Mode()); err != nil {
		return err
	}
	if sysStat, ok := srcStat.Sys().(*syscall.Stat_t); ok {
		if err = dstFile.Chown(int(sysStat.Uid), int(sysStat.Gid)); err != nil {
			return err
		}
	}

	_, err = io.Copy(dstFile, srcFile)
	return err
}
|
|
|
|
|
2023-12-04 21:28:04 +00:00
|
|
|
// DB holds a set of files which have been rebalanced.
|
|
|
|
//
|
|
|
|
// These files are skipped on future runs of the program.
|
|
|
|
//
|
|
|
|
// The database is loaded from a JSON file when the program starts and saved
|
|
|
|
// back to that JSON file as the program finishes.
|
2023-12-03 02:08:29 +00:00
|
|
|
type DB struct {
|
|
|
|
Processed map[string]struct{}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (db *DB) Contains(path string) bool {
|
2023-12-04 21:28:04 +00:00
|
|
|
dbLock.Lock()
|
|
|
|
defer dbLock.Unlock()
|
2023-12-03 02:08:29 +00:00
|
|
|
_, ok := db.Processed[path]
|
|
|
|
return ok
|
|
|
|
}
|
|
|
|
|
|
|
|
func (db *DB) Remember(path string) {
|
2023-12-04 21:28:04 +00:00
|
|
|
dbLock.Lock()
|
|
|
|
defer dbLock.Unlock()
|
2023-12-03 02:08:29 +00:00
|
|
|
db.Processed[path] = struct{}{}
|
|
|
|
}
|
2023-12-04 21:28:04 +00:00
|
|
|
|
|
|
|
func loadDb() error {
|
|
|
|
if *dbPath == "" {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
dbFile, err := os.Open(*dbPath)
|
|
|
|
if err != nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
defer func() {
|
|
|
|
_ = dbFile.Close()
|
|
|
|
}()
|
|
|
|
d := json.NewDecoder(dbFile)
|
|
|
|
err = d.Decode(&db)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
func saveDb() error {
|
|
|
|
if *dbPath == "" {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
dbFile, err := os.Create(*dbPath)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer func() {
|
|
|
|
_ = dbFile.Close()
|
|
|
|
}()
|
|
|
|
e := json.NewEncoder(dbFile)
|
|
|
|
err = e.Encode(&db)
|
|
|
|
return err
|
|
|
|
}
|