Instructions to use Charlie81/SkipMoE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Charlie81/SkipMoE with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="Charlie81/SkipMoE")# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("Charlie81/SkipMoE", dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Charlie81/SkipMoE with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "Charlie81/SkipMoE" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Charlie81/SkipMoE", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/Charlie81/SkipMoE
- SGLang
How to use Charlie81/SkipMoE with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "Charlie81/SkipMoE" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Charlie81/SkipMoE", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "Charlie81/SkipMoE" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Charlie81/SkipMoE", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use Charlie81/SkipMoE with Docker Model Runner:
docker model run hf.co/Charlie81/SkipMoE
| # train.py | |
| # runs train_olmoe_adapter.py with parameters when called | |
| # #!/usr/bin/env python | |
| """ | |
| Run script for fine-tuning OlmoE with adapters on specific text domains. | |
| Handles argument parsing and configuration. | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| from transformers import ( | |
| HfArgumentParser, | |
| TrainingArguments, | |
| ) | |
| class ScriptArguments: | |
| """ | |
| Arguments for the run script that aren't covered by TrainingArguments. | |
| """ | |
| model_path: str = field( | |
| default="allenai/OLMo-7B-Instruct", | |
| metadata={"help": "Path to the model to fine-tune"} | |
| ) | |
| output_dir: str = field( | |
| default="./output_olmoe_adapter", | |
| metadata={"help": "Directory to save the model and logs"} | |
| ) | |
| adapter_size: int = field( | |
| default=64, | |
| metadata={"help": "Size of the adapter layers"} | |
| ) | |
| dataset_name: str = field( | |
| default="mlfoundations/dclm-baseline-1.0", | |
| metadata={"help": "Name of the dataset to use"} | |
| ) | |
| max_steps: int = field( | |
| default=10000, | |
| metadata={"help": "Maximum number of training steps"} | |
| ) | |
| learning_rate: float = field( | |
| default=5e-5, | |
| metadata={"help": "Learning rate for fine-tuning"} | |
| ) | |
| per_device_batch_size: int = field( | |
| default=8, | |
| metadata={"help": "Batch size per device"} | |
| ) | |
| gradient_accumulation_steps: int = field( | |
| default=1, | |
| metadata={"help": "Number of steps to accumulate gradients"} | |
| ) | |
| # use_8bit: bool = field( | |
| # default=False, | |
| # metadata={"help": "Whether to use 8-bit precision"} | |
| # ) | |
| # use_4bit: bool = field( | |
| # default=False, | |
| # metadata={"help": "Whether to use 4-bit precision"} | |
| # ) | |
| def main(): | |
| # Parse command-line arguments | |
| parser = HfArgumentParser(ScriptArguments) | |
| args = parser.parse_args_into_dataclasses()[0] | |
| # Create output directory | |
| os.makedirs(args.output_dir, exist_ok=True) | |
| # Prepare command for training | |
| cmd = [ | |
| "python", | |
| "train_olmoe_adapter.py", | |
| # Model arguments | |
| f"--model_name_or_path={args.model_path}", | |
| f"--adapter_size={args.adapter_size}", | |
| "--freeze_base_model=True", # Always freeze the base model | |
| f"--checkpoint_dir={args.output_dir}", | |
| # Data arguments | |
| f"--dataset_name={args.dataset_name}", | |
| "--streaming=True", # Always stream for large datasets | |
| "--streaming_buffer_size=8192", | |
| "--max_seq_length=1024", | |
| # Training arguments | |
| f"--output_dir={args.output_dir}", | |
| f"--per_device_train_batch_size={args.per_device_batch_size}", | |
| f"--gradient_accumulation_steps={args.gradient_accumulation_steps}", | |
| f"--learning_rate={args.learning_rate}", | |
| f"--max_steps={args.max_steps}", | |
| "--warmup_steps=500", | |
| "--logging_steps=10", | |
| "--save_steps=1000", | |
| "--save_total_limit=2", | |
| "--dataloader_num_workers=4", | |
| "--seed=42", | |
| ] | |
| # Add precision flags if needed | |
| # if args.use_8bit: | |
| # cmd.append("--load_in_8bit") | |
| # if args.use_4bit: | |
| # cmd.append("--load_in_4bit") | |
| # Print the command for logging | |
| cmd_str = " ".join(cmd) | |
| print(f"Running command: {cmd_str}") | |
| # Execute the training script | |
| os.environ["PYTHONPATH"] = os.getcwd() | |
| ret = os.system(cmd_str) | |
| if ret != 0: | |
| print(f"Training failed with exit code {ret}") | |
| sys.exit(ret) | |
| print("Training completed successfully!") | |
| if __name__ == "__main__": | |
| main() |