- Partially explained in TFLOPS, MFU
- MFU = the ratio of observed throughput (tokens per second) to the theoretical maximum throughput of a system running at peak FLOPs; a small worked example follows this list.
- Defined in Appendix B of the PaLM paper: https://arxiv.org/pdf/2204.02311
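A rough illustration of the MFU arithmetic (all numbers here are made up; the 989 TFLOPS figure is the commonly quoted dense BF16 peak of an H100):

flops_per_token = 6 * 7e9    # ~6N rule for a hypothetical 7B-parameter model, attention term ignored
tokens_per_sec = 4_000       # assumed observed per-GPU training throughput
gpu_peak_flops = 989e12      # assumed dense BF16 peak, in FLOPs/s
mfu = flops_per_token * tokens_per_sec / gpu_peak_flops
print(f"MFU = {mfu:.1%}")    # ~17% with these numbers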
Transformer
torchtitan
https://github.com/pytorch/torchtitan/blob/b0ed7f075921357b01e28fddc6d90a2cc410bab3/train.py#L434
def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int:
    l, h, q, t = (
        model_config.n_layers,
        model_config.n_heads,
        model_config.dim // model_config.n_heads,
        seq_len,
    )
    # Reasoning behind the factor of 12 for the self-attention part of the formula:
    # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
    # 2. the flash attention does 1 more matmul recomputation in the backward
    #    but recomputation should not be counted in calculating MFU (+0)
    # 3. each matmul performs 1 multiplication and 1 addition (*2)
    # 4. we follow the convention and do not account for sparsity in causal attention
    flop_per_token = 6 * num_params + 12 * l * h * q * t
    return flop_per_token
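A quick sanity check of the formula with rough, Llama-7B-like numbers (the shape and parameter count below are approximate and only for illustration):

l, h, q, t = 32, 32, 128, 4096      # layers, heads, head dim, sequence length (assumed)
num_params = 6.5e9                  # rough non-embedding parameter count (assumed)
dense = 6 * num_params              # ~3.9e10 FLOPs/token from the 6N forward+backward rule
attn = 12 * l * h * q * t           # ~6.4e9 FLOPs/token from the attention-score matmuls
print(dense, attn, dense + attn)    # attention adds roughly 16% on top of 6N at this seq len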
num_flop_per_token = get_num_flop_per_token(
    get_num_params(whole_model, exclude_embedding=True),
    model_config,
    job_config.training.seq_len,
)

wps = ntokens_since_last_log / (
    time_delta * parallel_dims.model_parallel_size
)
# model FLOPS utilization
# For its definition and calculation, please refer to the PaLM paper:
# https://arxiv.org/abs/2204.02311
mfu = 100 * num_flop_per_token * wps / gpu_peak_flops
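Putting the pieces together as a minimal, self-contained sketch (this mirrors the snippet above but is not torchtitan's actual logging code; the throughput and peak-FLOPs numbers are assumed):

num_flop_per_token = 6 * 6.5e9 + 12 * 32 * 32 * 128 * 4096  # Llama-7B-like numbers from above
ntokens_since_last_log = 35_000    # assumed tokens processed by this rank since the last log
time_delta = 10.0                  # assumed seconds since the last log
model_parallel_size = 1            # model-parallel degree; assumed 1 (no tensor/pipeline parallelism)
gpu_peak_flops = 989e12            # assumed H100 dense BF16 peak, in FLOPs/s

wps = ntokens_since_last_log / (time_delta * model_parallel_size)
mfu = 100 * num_flop_per_token * wps / gpu_peak_flops
print(f"{mfu:.1f}% MFU")           # ~16% with these numbers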