Workhorse - Revision 1.4.0
Changes:
Hard Drives: + 4x Crucial P3 Plus 500GB PCIe Gen4 NVMe (5,000 MB/s)
Peripherals: + 2x ANYOYO 40Gbps M.2 NVMe SSD Enclosure (Thunderbolt 3/4 compatible)
Notes
I’ve been pondering whether there’s a way to add some extra storage, since I’m 100% out of M.2 slots on the rig at this point, which led me down the path of Thunderbolt. The motherboard comes with 2x Thunderbolt 4 ports, each of which should theoretically give me up to 40Gbps.
Now, what if we shoved a couple of M.2s into each of these, running in RAID 0 to max out the link (even though with the right single M.2 we could likely get there already), and used them to beat the 2.5G Ethernet connection to the NAS I currently load model weights from?
I ended up setting up two of these ANYOYO 40Gbps enclosures, each with 2x 500GB M.2s, since they’re cheap enough ($150 each), or at least much cheaper than the $300+ Thunderbolt RAID cards I was looking at, which made no sense anyway: the second you go above two M.2s you’ve already fully saturated Thunderbolt 3/4.
Crucially important: I am running two separate RAID 0 arrays, one per Thunderbolt port, not one big RAID across both.
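For a rough sense of the ceilings involved, here are my back-of-the-envelope numbers (rounded figures, not measurements):

# Back-of-the-envelope throughput ceilings (rough figures, not measurements)
gbps_to_gbs = 1 / 8  # gigabits per second -> gigabytes per second

eth_2_5g = 2.5 * gbps_to_gbs   # the NAS link I'm trying to beat
tb_raw   = 40 * gbps_to_gbs    # raw Thunderbolt 3/4 line rate; usable PCIe throughput is lower
gen4_m2  = 5.0                 # GB/s a fast Gen4 M.2 can manage on paper

print(f"2.5GbE:              {eth_2_5g:.2f} GB/s")
print(f"Thunderbolt 3/4 raw: {tb_raw:.2f} GB/s")
print(f"2x Gen4 in RAID 0:   {2 * gen4_m2:.2f} GB/s on paper -> the port, not the drives, is the limit")

In practice a single enclosure tops out well below the raw line rate, but anything close to it comfortably clears the 2.5GbE baseline.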
From there it was as simple as configuring the RAID 0 arrays with mdadm:
sudo mdadm --create --verbose /dev/md/raid0n0 --level=0 --raid-devices=2 /dev/nvme2n1 /dev/nvme4n1
sudo mdadm --create --verbose /dev/md/raid0n1 --level=0 --raid-devices=2 /dev/nvme1n2 /dev/nvme3n1
Splice the model weights evenly between the two RAID devices (I chose to split Qwen3 30B: shards 1-8 go to raid0n0, shards 9-16 plus the config/tokenizer files go to raid0n1; a copy helper is sketched below), and then come up with a parallel-loading scheme that hammers both arrays at once to pull the weights into memory and test things.
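Splitting the shards is just file copying. A rough helper for what I mean (the source path here is a placeholder, the shard naming follows the standard Hugging Face model-000NN-of-00016.safetensors layout, and it assumes the two arrays are formatted and mounted at /mnt/raid0n0 and /mnt/raid0n1):

# Hypothetical copy helper for splitting the 16 Qwen3-30B shards across the two arrays.
import shutil
from pathlib import Path

SRC  = Path("/mnt/nas/Qwen3-30B-A3B")        # wherever the weights currently live (placeholder)
DST1 = Path("/mnt/raid0n0/Qwen3-30B-A3B")    # shards 1-8
DST2 = Path("/mnt/raid0n1/Qwen3-30B-A3B")    # shards 9-16 plus config/tokenizer files

DST1.mkdir(parents=True, exist_ok=True)
DST2.mkdir(parents=True, exist_ok=True)

for f in sorted(SRC.iterdir()):
    if f.name.startswith("model-") and f.name.endswith(".safetensors"):
        shard_num = int(f.name.split("-")[1])   # "model-00003-of-00016..." -> 3
        dst = DST1 if shard_num <= 8 else DST2
    else:
        dst = DST2                              # everything else rides along with the second half
    shutil.copy2(f, dst / f.name)

With the shards in place, here's the parallel-loading test script (test.py):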
import os
import time
import multiprocessing as mp
from pathlib import Path
from typing import List, Tuple, Dict

import safetensors
from safetensors.torch import load_file
import torch
import numpy as np
from tqdm import tqdm


def get_weight_files(dir1: str, dir2: str) -> Tuple[List[str], List[str]]:
    """Get weight files ensuring sequential ordering from each RAID."""
    # First RAID has shards 1-8
    files1 = sorted([
        str(p) for p in Path(dir1).glob("*.safetensors")
    ])
    # Second RAID has shards 9-16 plus config files
    files2 = sorted([
        str(p) for p in Path(dir2).glob("*.safetensors")
    ])

    if len(files1) != 8 or len(files2) != 8:
        raise ValueError(f"Expected 8 shards on each RAID. Found {len(files1)} on RAID0 and {len(files2)} on RAID1")

    # Get shard numbers for information
    shard_nums1 = [int(Path(f).name.split('-')[1]) for f in files1]
    shard_nums2 = [int(Path(f).name.split('-')[1]) for f in files2]

    print(f"Found {len(files1)} shards in {dir1} (shards 1-8)")
    print(f"Found {len(files2)} shards in {dir2} (shards 9-16)")

    # List other files in RAID1 for information
    other_files = [
        p.name for p in Path(dir2).glob("*")
        if p.is_file() and not p.name.startswith("model-") and not p.name.endswith(".safetensors")
    ]
    if other_files:
        print("\nAdditional files in RAID1:")
        for f in sorted(other_files):
            print(f" {f}")

    return files1, files2


def load_raid_shards(args: Tuple[List[str], str, int]) -> Dict:
    """Load all shards from one RAID drive."""
    shard_files, raid_name, worker_id = args

    results = []
    total_size = 0
    total_time = 0

    # Process all shards from this RAID sequentially
    for i, shard_file in enumerate(shard_files):
        start_time = time.time()
        tensors = load_file(shard_file)

        # Force load into memory
        shard_size = 0
        for v in tensors.values():
            if isinstance(v, torch.Tensor):
                _ = v.mean().item()
                shard_size += v.nelement() * v.element_size()

        load_time = time.time() - start_time
        size_gb = shard_size / (1024 ** 3)

        total_size += size_gb
        total_time += load_time

        results.append({
            'file': shard_file,
            'size': size_gb,
            'time': load_time,
            'shard': int(Path(shard_file).name.split('-')[1])
        })

    return {
        'raid_name': raid_name,
        'worker_id': worker_id,
        'shards': results,
        'total_size': total_size,
        'total_time': total_time
    }


def measure_split_transfer_speed(dir1: str, dir2: str):
    """Measure transfer speed of loading split weights from two RAID drives."""
    print(f"Measuring transfer speed between {dir1} and {dir2}")

    try:
        # Get weight files from both directories
        files1, files2 = get_weight_files(dir1, dir2)

        # Use 2 workers - one for each RAID
        num_workers = 2
        print(f"\nUsing {num_workers} workers for parallel loading")
        print("Worker 0 will process all shards from RAID0")
        print("Worker 1 will process all shards from RAID1")

        # Prepare arguments for workers
        worker_args = [
            (files1, "RAID0", 0),
            (files2, "RAID1", 1)
        ]

        with mp.Pool(processes=num_workers) as pool:
            print("\nLoading shards in parallel...")
            results = pool.map(load_raid_shards, worker_args)

        # Process results
        for raid_result in results:
            raid_name = raid_result['raid_name']
            print(f"\n{raid_name} Results:")
            print(f"Total Size: {raid_result['total_size']:.2f} GB")
            print(f"Total Time: {raid_result['total_time']:.2f} seconds")
            print(f"Average Speed: {raid_result['total_size']/raid_result['total_time']:.2f} GB/s")

            print("\nIndividual Shard Details:")
            for shard in raid_result['shards']:
                print(f"Shard {shard['shard']}: {Path(shard['file']).name}")
                print(f" Size: {shard['size']:.2f} GB")
                print(f" Speed: {shard['size']/shard['time']:.2f} GB/s")

        # Calculate overall statistics
        total_size = sum(r['total_size'] for r in results)
        max_time = max(r['total_time'] for r in results)

        print("\nOverall Statistics:")
        for r in results:
            print(f"{r['raid_name']}: {r['total_size']:.2f} GB at {r['total_size']/r['total_time']:.2f} GB/s")
        print(f"Combined Data Loaded: {total_size:.2f} GB")
        print(f"Total Time: {max_time:.2f} seconds")
        print(f"Effective Speed: {total_size/max_time:.2f} GB/s")

    except Exception as e:
        print(f"Error during measurement: {str(e)}")


if __name__ == "__main__":
    SOURCE_DIR1 = "/mnt/raid0n0/Qwen3-30B-A3B"
    SOURCE_DIR2 = "/mnt/raid0n1/Qwen3-30B-A3B"

    try:
        measure_split_transfer_speed(SOURCE_DIR1, SOURCE_DIR2)
    except KeyboardInterrupt:
        print("\nMeasurement interrupted by user")
    except Exception as e:
        print(f"Error during measurement: {str(e)}")
And then just test it out (dropping the page cache first so the reads actually hit the drives rather than RAM):
sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'
python test.py
What I wound up getting:
Baseline 2.5G Ethernet:
Overall Statistics:
Total Data Loaded: 56.87 GB
Total Time: 294.17 seconds
Average Speed: 0.19 GB/s
Initializing empty model...
Loading tokenizer...
Loading weights into model...
Model loading completed in 0.34 seconds
Testing model with a simple prompt...
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Prompt: Hello, I am
Response: Hello, I am trying to find the value of the integral from 0 to 1 of (x^2 +
Using RAID + Thunderbolt:
Overall Statistics:
RAID0: 27.07 GB at 1.07 GB/s
RAID1: 29.80 GB at 1.10 GB/s
Combined Data Loaded: 56.87 GB
Total Time: 27.11 seconds
Average Speed: 2.10 GB/s
Initializing empty model...
Loading tokenizer...
Loading weights into model...
Model loading completed in 0.33 seconds
Testing model with a simple prompt...
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Prompt: Hello, I am
Response: Hello, I am trying to find the value of the integral from 0 to 1 of (x^2 +
As a result I saw over a 10x increase in model loading speed (0.19 GB/s → 2.10 GB/s, 294 seconds → 27 seconds)!