No description
  • Rust 92.7%
  • Python 7.3%
Find a file
2026-06-01 15:41:37 +02:00
local-s3 Fix Rust upload 2026-05-04 23:55:38 +02:00
perf-charts Produce result charts 2026-05-05 14:52:54 +02:00
results Add results 2026-06-01 15:41:37 +02:00
rust Produce result charts 2026-05-05 14:52:54 +02:00
spark Produce result charts 2026-05-05 14:52:54 +02:00
.gitignore Fix Rust upload 2026-05-04 23:55:38 +02:00
README.md Produce result charts 2026-05-05 14:52:54 +02:00
rust-toolchain CLI to generate a bunch of files 2026-03-22 20:59:01 +01:00

Ingest many JSON files

How to run

On machine A (192.168.1.15 on my network):

# Start the local-s3 server. BIND=0.0.0.0 is set as an environment variable for
# this one command (presumably the listen address, so that machine B can reach
# the server — confirm in local-s3). Arguments after `--` go to the binary.
cd local-s3
BIND=0.0.0.0 cargo run --release -- serve

On machine B:

# Install tools
# tally is used below as a wrapper around each benchmark command; dstat is used
# for the stat collection.
cargo install tally
sudo apt-get install dstat

# Run stat collection
# dstat keeps sampling every second until interrupted, so run it in a separate
# terminal (or in the background) while the benchmarks below execute.
mkdir -p data
# -o already writes the CSV. Discard the terminal-formatted stdout instead of
# redirecting it into the same file, which would mix it into the CSV.
dstat -tcdnm -o data/dstat.csv 1 > /dev/null

# Generate files
(
# Leave this subshell if the directory is missing, so cargo is never run from
# the wrong location. Only the subshell exits, not the calling shell.
cd local-s3 || exit 1
# Arguments after `--` are passed to the local-s3 binary, not to cargo.
cargo run --release -- generate-files --host http://192.168.1.15:9000 --num-files 10000 --prefix files/10000/
)

# Use Spark
(
# Leave this subshell if the directory is missing; otherwise the log lines
# below would be appended relative to the wrong directory.
cd spark || exit 1
# Make sure the log directory exists before appending to it.
mkdir -p ../data

# Each run: pause 3 s, append a label plus the start time to the log, then
# ingest files/10000/ into its own --table path, wrapped in tally.

sleep 3
echo "Spark p0 $(date +%T)" >> ../data/log.txt
tally uv run main.py --ingest --host http://192.168.1.15:9000 --files files/10000/ --table delta-spark/10000-p0/

sleep 3
echo "Spark p3 $(date +%T)" >> ../data/log.txt
tally uv run main.py --ingest --host http://192.168.1.15:9000 --files files/10000/ --table delta-spark/10000-p3/ --partitions 3

sleep 3
echo "Spark p6 $(date +%T)" >> ../data/log.txt
tally uv run main.py --ingest --host http://192.168.1.15:9000 --files files/10000/ --table delta-spark/10000-p6/ --partitions 6

sleep 3
echo "Spark p10 $(date +%T)" >> ../data/log.txt
tally uv run main.py --ingest --host http://192.168.1.15:9000 --files files/10000/ --table delta-spark/10000-p10/ --partitions 10
)

# Use Rust
(
# Leave this subshell if the directory is missing.
cd rust || exit 1
# Stop here if the build fails, so a stale binary left over from an earlier
# build is never benchmarked by mistake.
cargo build --release || exit 1
# Make sure the log directory exists before appending to it.
mkdir -p ../data

# Each run: pause 3 s, append a label plus the start time to the log, then
# ingest files/10000/ into its own --table path, wrapped in tally.

sleep 3
echo "Rust ts512 $(date +%T)" >> ../data/log.txt
tally ./target/release/rust --host http://192.168.1.15:9000 --files files/10000/ --table delta-rust/10000-ts512/ --target-size-mib 512

sleep 3
echo "Rust ts256 $(date +%T)" >> ../data/log.txt
tally ./target/release/rust --host http://192.168.1.15:9000 --files files/10000/ --table delta-rust/10000-ts256/ --target-size-mib 256

sleep 3
echo "Rust ts128 $(date +%T)" >> ../data/log.txt
tally ./target/release/rust --host http://192.168.1.15:9000 --files files/10000/ --table delta-rust/10000-ts128/ --target-size-mib 128
)
  
# Check
(
# Leave this subshell if the directory is missing.
cd spark || exit 1
# Both the Spark-written and the Rust-written tables are read back through the
# Spark script, in the same order in which they were produced above.
for table in \
  delta-spark/10000-p0/ \
  delta-spark/10000-p3/ \
  delta-spark/10000-p6/ \
  delta-spark/10000-p10/ \
  delta-rust/10000-ts512/ \
  delta-rust/10000-ts256/ \
  delta-rust/10000-ts128/; do
  uv run main.py --show --host http://192.168.1.15:9000 --table "$table"
done
)
  
# Uninstall dstat (installed with apt-get above)
sudo apt-get remove dstat