# llmEval/scripts/download_mmlu.py
#
# 40 lines
# 899 B
# Python

"""Download MMLU dataset to data/mmlu/."""
import argparse
import os
import urllib.request
import zipfile
# Base URL; each archive name in FILES is appended to it to form the download URL.
REPO = "https://github.com/hendrycks/test/raw/master/"

# Zip archives to fetch, in download order.
FILES = [
    f"{split}.zip"
    for split in ("auxiliary", "dev", "test", "val")
]
def main(argv=None):
    """Download the MMLU archives and unpack them into the output directory.

    For every name in ``FILES`` the archive is fetched from ``REPO + name``,
    extracted into ``--output_dir`` (default ``data/mmlu``), and the
    downloaded zip file is then deleted.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, so calling ``main()`` with no
            arguments behaves as before.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or
        ``zipfile.ZipFile`` propagates to the caller; the archive being
        processed is removed from disk before the exception escapes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_dir",
        type=str,
        default="data/mmlu",
        help="directory the archives are extracted into",
    )
    args = parser.parse_args(argv)

    os.makedirs(args.output_dir, exist_ok=True)

    for fname in FILES:
        url = REPO + fname
        zip_path = os.path.join(args.output_dir, fname)
        try:
            print(f"Downloading {url}...")
            urllib.request.urlretrieve(url, zip_path)
            print(f"Extracting {zip_path}...")
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(args.output_dir)
        finally:
            # Clean up the archive on failure as well as success, so an
            # interrupted download or an invalid zip does not leave a
            # partial file in the output directory. The existence check
            # avoids masking the original error when the download failed
            # before anything was written.
            if os.path.exists(zip_path):
                os.remove(zip_path)

    print(f"MMLU data saved to {args.output_dir}")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()