Instructions to use DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use DeepXR/Helion-V2.5-Rnd with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use DeepXR/Helion-V2.5-Rnd with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/DeepXR/Helion-V2.5-Rnd

SGLang

How to use DeepXR/Helion-V2.5-Rnd with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DeepXR/Helion-V2.5-Rnd" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
```
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```

Helion-V2.5-Rnd / inference / evaluate.py

Trouter-Library

Create inference/evaluate.py

f15baf7 verified 6 months ago

raw

history blame contribute delete

12.9 kB

	#!/usr/bin/env python3
	"""
	Helion-2.5-Rnd Evaluation Script
	Comprehensive benchmark evaluation across multiple datasets
	"""

	import argparse
	import json
	import logging
	import os
	import re
	from collections import defaultdict
	from pathlib import Path
	from typing import Dict, List, Optional

	import torch
	from datasets import load_dataset
	from tqdm import tqdm
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Root logging is configured once at import time: INFO and above, with each
	# record rendered as "<time> - <logger name> - <level> - <message>".
	_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

	# Module-level logger shared by everything in this script.
	logger = logging.getLogger(__name__)


	class HelionEvaluator:
	    """Evaluation framework for Helion model.

	    The tokenizer and model are loaded once in ``__init__``. Each
	    ``evaluate_*`` method downloads one benchmark dataset, prompts the model
	    example by example through ``generate`` and returns a result dictionary.
	    ``evaluate_all`` runs every benchmark, optionally writes the combined
	    results to a JSON file and logs a summary.
	    """

	    # Number grammar shared by both patterns: optional sign, digits, any
	    # number of ",ddd" groups and an optional decimal part.
	    _NUMBER = r'(-?\d+(?:,\d+)*(?:\.\d+)?)'
	    # Final number of a GSM8K reference answer, introduced by "####".
	    _REFERENCE_RE = re.compile(r'####\s*' + _NUMBER)
	    # Candidate answers in a lower-cased model response: a number that
	    # follows "answer is" or "=", optionally prefixed by a dollar sign.
	    _PREDICTION_RE = re.compile(r'(?:answer is|=)\s*\$?\s*' + _NUMBER)

	    def __init__(
	        self,
	        model_path: str,
	        device: str = "cuda",
	        batch_size: int = 1,
	        max_length: int = 2048
	    ):
	        """
	        Initialize evaluator

	        Args:
	            model_path: Path to model or HuggingFace model ID
	            device: Device the tokenized inputs are moved to in ``generate``
	            batch_size: Batch size for evaluation (stored on the instance;
	                no method of this class reads it, prompts are run one at
	                a time)
	            max_length: Maximum prompt length in tokens; longer prompts are
	                truncated by the tokenizer
	        """
	        logger.info(f"Loading model from {model_path}")

	        # Kept so evaluate_all() can report which model produced the numbers.
	        self.model_path = model_path

	        # The tokenizer gets the same trust_remote_code setting as the model,
	        # so a repository shipping custom code loads both consistently.
	        self.tokenizer = AutoTokenizer.from_pretrained(
	            model_path,
	            trust_remote_code=True
	        )
	        self.model = AutoModelForCausalLM.from_pretrained(
	            model_path,
	            torch_dtype=torch.bfloat16,
	            device_map="auto",
	            trust_remote_code=True
	        )

	        self.device = device
	        self.batch_size = batch_size
	        self.max_length = max_length

	        logger.info("Model loaded successfully")

	    # ------------------------------------------------------------------
	    # Private helpers
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _limit(dataset, num_samples: Optional[int]):
	        """Return the first ``num_samples`` rows, or ``dataset`` unchanged when it is falsy."""
	        if num_samples:
	            return dataset.select(range(min(num_samples, len(dataset))))
	        return dataset

	    @staticmethod
	    def _score(benchmark: str, correct: int, total: int) -> Dict:
	        """Build the result dictionary shared by the accuracy-based benchmarks."""
	        return {
	            "benchmark": benchmark,
	            "accuracy": correct / total if total > 0 else 0,
	            "correct": correct,
	            "total": total
	        }

	    @classmethod
	    def _extract_reference_answer(cls, answer: str) -> Optional[str]:
	        """Return the number after "####" with commas removed, or None if absent."""
	        match = cls._REFERENCE_RE.search(answer)
	        return match.group(1).replace(',', '') if match else None

	    @classmethod
	    def _extract_predicted_answer(cls, response: str) -> Optional[str]:
	        """Return the last number following "answer is" or "=" in ``response``.

	        The last match is used because step-by-step solutions contain
	        intermediate "=" results before the final answer. Commas are
	        removed; None is returned when nothing matches.
	        """
	        found = cls._PREDICTION_RE.findall(response.lower())
	        return found[-1].replace(',', '') if found else None

	    @staticmethod
	    def _answers_match(predicted: str, reference: str) -> bool:
	        """Compare two extracted numbers by value, so "18.0" equals "18"."""
	        return float(predicted) == float(reference)

	    # ------------------------------------------------------------------
	    # Generation
	    # ------------------------------------------------------------------

	    def generate(
	        self,
	        prompt: str,
	        max_new_tokens: int = 512,
	        temperature: float = 0.0,
	        **kwargs
	    ) -> str:
	        """Generate text from prompt

	        Args:
	            prompt: Text to continue; truncated to ``self.max_length`` tokens
	            max_new_tokens: Upper bound on the number of generated tokens
	            temperature: Values above 0 enable sampling at that temperature;
	                0 (or below) disables sampling
	            **kwargs: Extra arguments forwarded to ``self.model.generate``;
	                they take precedence over the defaults built here

	        Returns:
	            The decoded continuation only (prompt tokens removed), stripped
	            of surrounding whitespace
	        """
	        # NOTE(review): the model is placed with device_map="auto" while the
	        # inputs go to self.device -- confirm the two agree when a device
	        # other than the default is requested.
	        inputs = self.tokenizer(
	            prompt,
	            return_tensors="pt",
	            truncation=True,
	            max_length=self.max_length
	        ).to(self.device)

	        sampling = temperature > 0

	        # Fall back to the EOS id when the tokenizer defines no pad token.
	        pad_token_id = self.tokenizer.pad_token_id
	        if pad_token_id is None:
	            pad_token_id = self.tokenizer.eos_token_id

	        generation_args = {
	            "max_new_tokens": max_new_tokens,
	            "temperature": temperature if sampling else 1.0,
	            "do_sample": sampling,
	            "pad_token_id": pad_token_id,
	        }
	        # Merging instead of passing both avoids a duplicate-keyword
	        # TypeError when a caller supplies e.g. pad_token_id itself.
	        generation_args.update(kwargs)

	        with torch.no_grad():
	            outputs = self.model.generate(**inputs, **generation_args)

	        # Everything up to the prompt length is the echoed prompt.
	        prompt_length = inputs['input_ids'].shape[1]
	        response = self.tokenizer.decode(
	            outputs[0][prompt_length:],
	            skip_special_tokens=True
	        )

	        return response.strip()

	    # ------------------------------------------------------------------
	    # Benchmarks
	    # ------------------------------------------------------------------

	    def evaluate_mmlu(self, num_samples: Optional[int] = None) -> Dict:
	        """Evaluate on MMLU benchmark

	        Each question is rendered with lettered choices (A, B, C, ...) and
	        the first character of the model's reply is compared with the
	        letter of the correct choice.

	        Args:
	            num_samples: Evaluate only the first N examples when given

	        Returns:
	            Dict with "benchmark", "accuracy", "correct" and "total"
	        """
	        logger.info("Evaluating on MMLU...")

	        dataset = self._limit(
	            load_dataset("cais/mmlu", "all", split="test"),
	            num_samples
	        )

	        correct = 0
	        total = 0

	        for example in tqdm(dataset, desc="MMLU"):
	            # Format prompt
	            options = "".join(
	                f"{chr(65 + i)}. {choice}\n"
	                for i, choice in enumerate(example["choices"])
	            )
	            prompt = f"Question: {example['question']}\n\nChoices:\n{options}\nAnswer: "

	            # Generate response
	            response = self.generate(prompt, max_new_tokens=10, temperature=0.0)

	            # Extract answer: first visible character, "" for an empty reply
	            pred = response.strip()[:1].upper()
	            correct_answer = chr(65 + example["answer"])

	            if pred == correct_answer:
	                correct += 1
	            total += 1

	        return self._score("MMLU", correct, total)

	    def evaluate_gsm8k(self, num_samples: Optional[int] = None) -> Dict:
	        """Evaluate on GSM8K mathematical reasoning

	        The reference number is read from the "#### <number>" marker of the
	        dataset answer; examples without it are skipped and not counted.
	        The prediction is the last number following "answer is" or "=" in
	        the model's response, compared with the reference by value.

	        Args:
	            num_samples: Evaluate only the first N examples when given

	        Returns:
	            Dict with "benchmark", "accuracy", "correct" and "total"
	        """
	        logger.info("Evaluating on GSM8K...")

	        dataset = self._limit(
	            load_dataset("gsm8k", "main", split="test"),
	            num_samples
	        )

	        correct = 0
	        total = 0

	        for example in tqdm(dataset, desc="GSM8K"):
	            # Extract numerical answer
	            correct_answer = self._extract_reference_answer(example["answer"])
	            if correct_answer is None:
	                continue

	            # Format prompt
	            prompt = f"Question: {example['question']}\n\nLet's solve this step by step:\n"

	            # Generate response
	            response = self.generate(prompt, max_new_tokens=512, temperature=0.0)

	            # Extract predicted answer; a missing prediction counts as wrong
	            pred_answer = self._extract_predicted_answer(response)
	            if pred_answer is not None and self._answers_match(pred_answer, correct_answer):
	                correct += 1

	            total += 1

	        return self._score("GSM8K", correct, total)

	    def evaluate_humaneval(self, num_samples: Optional[int] = None) -> Dict:
	        """Evaluate on HumanEval code generation

	        Only generates completions; nothing is executed or scored here.

	        Args:
	            num_samples: Generate for only the first N problems when given

	        Returns:
	            Dict with "benchmark", "samples_generated", "note" and
	            "completions" (one dict per problem with "task_id",
	            "completion" and "test"), or a dict with "benchmark" and
	            "error" when the dataset cannot be loaded
	        """
	        logger.info("Evaluating on HumanEval...")

	        try:
	            dataset = load_dataset("openai_humaneval", split="test")
	        except Exception as exc:
	            # Deliberately best-effort, but KeyboardInterrupt/SystemExit are
	            # no longer swallowed and the cause is logged.
	            logger.warning("HumanEval dataset not available: %s", exc)
	            return {"benchmark": "HumanEval", "error": "Dataset not available"}

	        dataset = self._limit(dataset, num_samples)

	        results = []

	        for example in tqdm(dataset, desc="HumanEval"):
	            prompt = example["prompt"]

	            # Generate code
	            full_prompt = f"Complete the following Python function:\n\n{prompt}"
	            response = self.generate(
	                full_prompt,
	                max_new_tokens=512,
	                temperature=0.0
	            )

	            results.append({
	                "task_id": example["task_id"],
	                # Original function stub followed by the model's continuation
	                "completion": prompt + response,
	                "test": example["test"]
	            })

	        # Note: Full evaluation requires executing code
	        # This is a simplified version; the completions are returned so an
	        # external execution framework can score them.
	        return {
	            "benchmark": "HumanEval",
	            "samples_generated": len(results),
	            "note": "Full evaluation requires code execution framework",
	            "completions": results
	        }

	    def evaluate_truthfulqa(self, num_samples: Optional[int] = None) -> Dict:
	        """Evaluate on TruthfulQA

	        Only collects the model's answers next to the dataset's reference
	        answers; no automatic scoring is performed.

	        Args:
	            num_samples: Answer only the first N questions when given

	        Returns:
	            Dict with "benchmark", "samples_evaluated", "note" and
	            "responses" (one dict per question holding the model response
	            and the dataset's best/correct/incorrect answers)
	        """
	        logger.info("Evaluating on TruthfulQA...")

	        dataset = self._limit(
	            load_dataset("truthful_qa", "generation", split="validation"),
	            num_samples
	        )

	        responses = []

	        for example in tqdm(dataset, desc="TruthfulQA"):
	            question = example["question"]

	            prompt = f"Question: {question}\n\nProvide a truthful and accurate answer:\nAnswer: "

	            response = self.generate(prompt, max_new_tokens=256, temperature=0.0)

	            responses.append({
	                "question": question,
	                "response": response,
	                "best_answer": example["best_answer"],
	                "correct_answers": example["correct_answers"],
	                "incorrect_answers": example["incorrect_answers"]
	            })

	        # The responses are returned because the note asks for manual
	        # review, which is impossible if they are thrown away.
	        return {
	            "benchmark": "TruthfulQA",
	            "samples_evaluated": len(responses),
	            "note": "Manual review required for truthfulness assessment",
	            "responses": responses
	        }

	    def evaluate_all(
	        self,
	        output_file: Optional[str] = None,
	        num_samples: Optional[int] = None
	    ) -> Dict:
	        """Run all evaluations

	        A benchmark that raises is logged and recorded as
	        ``{"error": <message>}`` so the remaining benchmarks still run.

	        Args:
	            output_file: When given, results are written there as JSON;
	                missing parent directories are created
	            num_samples: Per-benchmark sample limit passed to each method

	        Returns:
	            Dict with "model" (the path/ID this evaluator was built with)
	            and "benchmarks" (one entry per benchmark key)
	        """
	        logger.info("Starting comprehensive evaluation...")

	        results = {
	            "model": self.model_path,
	            "benchmarks": {}
	        }

	        # Run evaluations: (result key, name used in log messages, method)
	        runners = (
	            ("mmlu", "MMLU", self.evaluate_mmlu),
	            ("gsm8k", "GSM8K", self.evaluate_gsm8k),
	            ("humaneval", "HumanEval", self.evaluate_humaneval),
	            ("truthfulqa", "TruthfulQA", self.evaluate_truthfulqa),
	        )
	        for key, label, runner in runners:
	            try:
	                results["benchmarks"][key] = runner(num_samples)
	            except Exception as e:
	                # logger.exception keeps the traceback alongside the message.
	                logger.exception(f"{label} evaluation failed: {e}")
	                results["benchmarks"][key] = {"error": str(e)}

	        # Save results
	        if output_file:
	            output_path = Path(output_file)
	            output_path.parent.mkdir(parents=True, exist_ok=True)

	            with open(output_path, 'w', encoding='utf-8') as f:
	                json.dump(results, f, indent=2)

	            logger.info(f"Results saved to {output_path}")

	        # Print summary
	        logger.info("\n" + "="*50)
	        logger.info("EVALUATION SUMMARY")
	        logger.info("="*50)

	        for benchmark, result in results["benchmarks"].items():
	            if "accuracy" in result:
	                logger.info(f"{benchmark.upper()}: {result['accuracy']:.2%}")
	            elif "error" in result:
	                logger.info(f"{benchmark.upper()}: ERROR - {result['error']}")
	            else:
	                logger.info(f"{benchmark.upper()}: {result.get('note', 'Completed')}")

	        return results


	def main(argv: Optional[List[str]] = None) -> None:
	    """Main evaluation entry point

	    Parses the command line, loads the model through ``HelionEvaluator``
	    and runs either every benchmark or the selected subset, writing the
	    results to the ``--output`` JSON file.

	    Args:
	        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
	            (the default) keeps the normal command-line behaviour
	    """
	    parser = argparse.ArgumentParser(description="Evaluate Helion model")
	    parser.add_argument(
	        "--model",
	        type=str,
	        required=True,
	        help="Model path or HuggingFace ID"
	    )
	    parser.add_argument(
	        "--benchmarks",
	        type=str,
	        nargs="+",
	        default=["all"],
	        choices=["all", "mmlu", "gsm8k", "humaneval", "truthfulqa"],
	        help="Benchmarks to run"
	    )
	    parser.add_argument(
	        "--output",
	        type=str,
	        default="evaluation_results.json",
	        help="Output file for results"
	    )
	    parser.add_argument(
	        "--num-samples",
	        type=int,
	        default=None,
	        help="Number of samples to evaluate (for quick testing)"
	    )
	    parser.add_argument(
	        "--device",
	        type=str,
	        default="cuda",
	        help="Device to use"
	    )
	    parser.add_argument(
	        "--batch-size",
	        type=int,
	        default=1,
	        help="Batch size"
	    )

	    args = parser.parse_args(argv)

	    # Initialize evaluator
	    evaluator = HelionEvaluator(
	        model_path=args.model,
	        device=args.device,
	        batch_size=args.batch_size
	    )

	    # Run evaluations
	    if "all" in args.benchmarks:
	        # evaluate_all() writes the output file and logs the summary itself.
	        evaluator.evaluate_all(
	            output_file=args.output,
	            num_samples=args.num_samples
	        )
	        return

	    results = {"model": args.model, "benchmarks": {}}

	    # Fixed order, independent of the order given on the command line.
	    for name in ("mmlu", "gsm8k", "humaneval", "truthfulqa"):
	        if name not in args.benchmarks:
	            continue
	        run_benchmark = getattr(evaluator, f"evaluate_{name}")
	        try:
	            results["benchmarks"][name] = run_benchmark(args.num_samples)
	        except Exception as exc:
	            # Mirror evaluate_all(): one failing benchmark must not discard
	            # the results already collected for the others.
	            logger.exception(f"{name} evaluation failed: {exc}")
	            results["benchmarks"][name] = {"error": str(exc)}

	    # Save results; create missing parent directories first so a long run
	    # is not lost to a FileNotFoundError at the very end.
	    output_path = Path(args.output)
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    with open(output_path, 'w', encoding='utf-8') as f:
	        json.dump(results, f, indent=2)

	    logger.info(f"Results saved to {args.output}")


	if __name__ == "__main__":
	    main()