Artificial Intelligence

Eine Code-Implementierung zum Komprimieren und Benchmarken von Instruction-Tuned-LLMs mit FP8-, GPTQ- und SmoothQuant-Quantisierung unter Verwendung von llmcompressor

Von admin Mai 18, 2026 #anweisungsabgestimmten, #Benchmarken, #Codierungsimplementierung, #Eine, #FP8, #GPTQ, #Komprimieren, #llmcompressor, #LLMs, #MIT, #SmoothQuantQuantisierung, #und, #unter, #Verwendung, #Von, #zum

import subprocess, sys
def pip(*pkgs):
    """Install the given packages quietly into the running interpreter.

    Runs ``<sys.executable> -m pip install -q <pkgs...>`` so the packages land
    in the same environment that is executing this script.

    Args:
        *pkgs: pip requirement specifiers (e.g. ``"transformers>=4.45"``).

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    # "install" is the pip subcommand; an argv list avoids any shell parsing.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])


# Packages the rest of the script imports or relies on.
pip("llmcompressor", "compressed-tensors",
    "transformers>=4.45", "accelerate", "datasets")
import os, gc, time, json, math
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Fail fast when no CUDA device is visible; the message must be parenthesized
# to continue onto the next line.
assert torch.cuda.is_available(), (
    "Enable a GPU: Runtime > Change runtime type > T4 GPU")
# Report the device name, the CUDA version torch was built against, and the
# torch version.
print("GPU:", torch.cuda.get_device_name(0),
      "| CUDA:", torch.version.cuda,
      "| torch:", torch.__version__)
# Hugging Face model id used as the baseline checkpoint.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
# Scratch directory for this script; the process changes into it so that
# relative paths resolve here.
# NOTE(review): "/content" is presumably the Colab working root — confirm when
# running elsewhere.
WORKDIR = Path("/content/quant_lab")
WORKDIR.mkdir(parents=True, exist_ok=True)
os.chdir(WORKDIR)
def free_mem():
    """Run a garbage-collection pass, then release cached CUDA memory."""
    # Collect first so tensors that are only kept alive by reference cycles
    # are freed before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Return the combined size of all files under ``path`` in gigabytes.

    The directory tree is walked recursively with ``os.walk`` and the sizes
    reported by ``os.path.getsize`` are summed.

    Args:
        path: root directory to measure.

    Returns:
        Total size as a float, in units of 1e9 bytes. ``0.0`` when the tree
        contains no files.
    """
    total_bytes = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, names in os.walk(path)
        for name in names
    )
    return total_bytes / 1e9
def time_generation(mannequin, tok, immediate, max_new_tokens=64):
    """Greedy-decode a prompt and report latency and throughput after a short warmup.

    Args:
        mannequin: the model; must expose ``.device`` and ``.generate(...)``.
        tok: the tokenizer; called on the prompt and used to decode the output.
        immediate: the prompt text.
        max_new_tokens: upper bound on tokens generated in the timed run.

    Returns:
        A tuple ``(text, seconds, tokens_per_second)``: the decoded
        continuation (prompt tokens stripped), the wall-clock time of the
        timed ``generate`` call, and the number of generated tokens divided
        by that time.
    """
    inputs = tok(immediate, return_tensors="pt").to(mannequin.device)

    # Short untimed warmup so one-off setup cost is not counted in the measurement.
    _ = mannequin.generate(**inputs, max_new_tokens=4, do_sample=False,
                           pad_token_id=tok.eos_token_id)
    torch.cuda.synchronize()

    # Synchronize before and after so the timer brackets the actual GPU work.
    t0 = time.perf_counter()
    out = mannequin.generate(**inputs, max_new_tokens=max_new_tokens,
                             do_sample=False, pad_token_id=tok.eos_token_id)
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Drop the prompt tokens; what remains is the newly generated continuation.
    prompt_len = inputs["input_ids"].shape[1]
    new_ids = out[0][prompt_len:]

    # Throughput uses the tokens actually produced: generation may stop at EOS
    # before max_new_tokens, which would otherwise overstate tokens/sec.
    tokens_per_s = new_ids.shape[0] / dt
    return tok.decode(new_ids, skip_special_tokens=True), dt, tokens_per_s
@torch.no_grad()
def wikitext_ppl(mannequin, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (quick, indicative).

    Joins the non-blank lines among the first 400 rows of the WikiText-2 raw
    test split, tokenizes the result once, and scores fixed-length windows
    with the model.

    Args:
        mannequin: the model; called as ``mannequin(chunk, labels=chunk)`` and
            expected to return an object with a ``.loss`` tensor.
        tok: the tokenizer used to encode the evaluation text.
        seq_len: window length in tokens.
        max_chunks: maximum number of windows to score.
        stride: step between window start positions, in tokens.

    Returns:
        ``exp`` of the seq_len-weighted mean of the per-window losses.

    Raises:
        ValueError: if the tokenized text is too short to form a single window.
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(mannequin.device)

    nll_sum, tok_count = 0.0, 0
    for start in range(0, enc.size(1) - seq_len, stride):
        chunk = enc[:, start:start + seq_len]
        out = mannequin(chunk, labels=chunk)
        # NOTE(review): out.loss is treated as a per-token mean for the window
        # and weighted by seq_len; fine for relative comparisons, confirm if
        # exact perplexity matters.
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if tok_count // seq_len >= max_chunks:
            break

    if tok_count == 0:
        # No window fit: report it clearly instead of dividing by zero below.
        raise ValueError(
            f"evaluation text has {enc.size(1)} tokens; need more than "
            f"seq_len={seq_len} to score at least one chunk")
    return math.exp(nll_sum / tok_count)
# Benchmark records keyed by label; filled in by benchmark().
outcomes = {}
# Chat-formatted prompt using <|im_start|>/<|im_end|> turn markers. The
# newlines after the role names and after <|im_end|> are part of the markup.
PROMPT = ("<|im_start|>user\nIn two sentences, explain why post-training "
          "quantization works for large language models.<|im_end|>\n"
          "<|im_start|>assistant\n")
def benchmark(label, model_path_or_id):
    """Load a model, measure generation speed and perplexity, and record the result.

    The tokenizer and model are loaded from ``model_path_or_id`` onto CUDA,
    timed with ``time_generation`` on ``PROMPT``, scored with
    ``wikitext_ppl``, and the summary is stored in ``outcomes[label]`` and
    printed as JSON.

    Args:
        label: key under which the record is stored in ``outcomes``.
        model_path_or_id: local checkpoint directory or hub model id, passed
            to ``from_pretrained``.

    Returns:
        None. The record is written to the module-level ``outcomes`` dict.
    """
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = AutoModelForCausalLM.from_pretrained(
        model_path_or_id, torch_dtype="auto", device_map="cuda").eval()

    pattern, dt, tps = time_generation(m, tok, PROMPT)
    ppl = wikitext_ppl(m, tok)

    # On-disk size is only measurable for local directories; a non-directory
    # argument (e.g. a hub id) gets None.
    if os.path.isdir(str(model_path_or_id)):
        size_gb = dir_size_gb(model_path_or_id)
    else:
        size_gb = None

    outcomes[label] = {
        "size_gb": size_gb,
        "ppl": round(ppl, 3),
        "latency_s": round(dt, 3),
        "tok_per_s": round(tps, 1),
        # Single-line preview of the generated text, capped at 180 characters.
        "pattern": pattern.strip().replace("\n", " ")[:180],
    }
    print(json.dumps(outcomes[label], indent=2))

    # Drop the model reference and clear caches before the next benchmark.
    del m
    free_mem()

Von admin

Schreibe einen Kommentar Antworten abbrechen

Machine Learning

Loop Engineering für die RAG-Generierung: Top-K einzeln iterieren

Kaggle + Googles kostenloser 5-tägiger Agentic AI-Kurs

Umfragestatistik: Poststratifizierung ohne Informationen zur Bevölkerungsebene

Artificial Intelligence

Synthetische vs. reale Daten für die Robotik: Leitfaden 2026