.. DO NOT EDIT. .. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. .. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: .. "basic_usage/example_huggingface.py" .. LINE NUMBERS ARE GIVEN BELOW. .. only:: html .. note:: :class: sphx-glr-download-link-note :ref:`Go to the end <sphx_glr_download_basic_usage_example_huggingface.py>` to download the full example code. .. rst-class:: sphx-glr-example-title .. _sphx_glr_basic_usage_example_huggingface.py: Usage with Huggingface LLMs =============================== This example demonstrates how to work with Huggingface (HF) language models. As always, let's first import the required functionality. Remember to run :code:`pip install -U transformers datasets` beforehand. .. GENERATED FROM PYTHON SOURCE LINES 9-30 .. code-block:: Python from collections import UserDict from collections.abc import MutableMapping import torch.utils.data as data_utils from datasets import Dataset from torch import Tensor, bfloat16, eye, manual_seed, no_grad from torch.nn import CrossEntropyLoss, Module from transformers import ( DataCollatorWithPadding, GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer, PreTrainedTokenizer, ) from curvlinops import GGNLinearOperator # make deterministic manual_seed(0) .. rst-class:: sphx-glr-script-out .. code-block:: none .. GENERATED FROM PYTHON SOURCE LINES 31-36 Data ---- We will use synthetic data for simplicity, but it can be replaced with any HF dataloader. .. GENERATED FROM PYTHON SOURCE LINES 37-65 .. code-block:: Python tokenizer = GPT2Tokenizer.from_pretrained("gpt2") tokenizer.pad_token_id = tokenizer.eos_token_id data = [ {"text": "Today is hot, but I will manage!!!!", "label": 1}, {"text": "Tomorrow is cold", "label": 0}, {"text": "Carpe diem", "label": 1}, {"text": "Tempus fugit", "label": 1}, ] dataset = Dataset.from_list(data) def tokenize(row): """Tokenize a dataset row for GPT-2. Returns: Tokenized dictionary for the given row. 
""" return tokenizer(row["text"]) dataset = dataset.map(tokenize, remove_columns=["text"]) dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"]) dataloader = data_utils.DataLoader( dataset, batch_size=100, collate_fn=DataCollatorWithPadding(tokenizer) ) .. rst-class:: sphx-glr-script-out .. code-block:: none Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. Map: 0%| | 0/4 [00:00 None: """Initialize the wrapper with a tokenizer.""" super().__init__() config = GPT2Config.from_pretrained("gpt2") config.pad_token_id = tokenizer.pad_token_id config.num_labels = 2 self.hf_model = GPT2ForSequenceClassification.from_pretrained( "gpt2", config=config ) # For simplicity, only enable grad for the last layer for p in self.hf_model.parameters(): p.requires_grad = False for p in self.hf_model.score.parameters(): p.requires_grad = True def forward(self, data: MutableMapping) -> Tensor: """Run the model forward pass and move inputs to the correct device. Args: data: A dict-like data structure with `input_ids` inside. This is the default data structure assumed by Huggingface dataloaders. Returns: logits: An `(batch_size, n_classes)`-sized tensor of logits. """ device = next(self.parameters()).device input_ids = data["input_ids"].to(device) output_dict = self.hf_model(input_ids) return output_dict.logits model = MyGPT2(tokenizer).to(bfloat16) with no_grad(): logits = model(data) print(f"Logits shape: {logits.shape}") .. rst-class:: sphx-glr-script-out .. code-block:: none Loading weights: 0%| | 0/148 [00:00` .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: example_huggingface.py <example_huggingface.py>` .. container:: sphx-glr-download sphx-glr-download-zip :download:`Download zipped: example_huggingface.zip <example_huggingface.zip>` .. only:: html .. rst-class:: sphx-glr-signature `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_