diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c2baa6c --- /dev/null +++ b/.gitignore @@ -0,0 +1,182 @@ +__pycache__/ +.venv/ + +.env + +image/ +audio/ +video/ +dataframe/ + +static/generated +swarms/__pycache__ +venv +.DS_Store +.ruff_cache +.DS_STORE +swarms/agents/.DS_Store + +_build + + +.DS_STORE +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py +.DS_Store +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
\ No newline at end of file
diff --git a/LongNet/README.md b/LongNet/README.md
deleted file mode 100644
index 61366f6..0000000
--- a/LongNet/README.md
+++ /dev/null
@@ -1,428 +0,0 @@
-# LongNet Implementation Research Document
-
-## System Analysis
-
-The LongNet architecture builds on the Transformer, modified to handle much longer sequences. Its foundation is the self-attention mechanism, which maps a query and a set of key-value pairs to an output. However, self-attention struggles with long sequences because of its quadratic dependency on sequence length, which makes it computationally inefficient.
-
-To resolve this, LongNet introduces dilated attention, which splits the input into equally sized segments. Each segment is then sparsified along the sequence dimension by selecting rows at a fixed interval (the dilation rate); attention is computed within each sparsified segment, and the outputs are scattered back and concatenated.
-
-Dilated attention reduces the computation cost significantly compared to vanilla attention. In practice, the segment size trades the globality of attention for efficiency, while the dilation rate reduces the computation cost by approximating the attention matrix.
-
-To capture both long-range and short-range information efficiently, LongNet mixes dilated attentions with different segment sizes and dilation rates.
-
-LongNet also incorporates the multi-head attention mechanism, with each head using a distinct offset when selecting the query-key-value pairs.
-
-## Algorithmic Pseudocode
-
-The following is high-level pseudocode for the LongNet model:
-
-```python
-class LongNet:
-    Initialize parameters for LongNet
-
-    def dilated_attention(self, input):
-        # Split the input into segments
-        input_segments = split(input)
-
-        # Sparsify each segment along the sequence dimension
-        sparsified_segments = sparsify(input_segments)
-
-        # Feed sparsified segments into attention
-        attended_segments = attention(sparsified_segments)
-
-        # Scatter and concatenate the segments as output
-        output = scatter_and_concatenate(attended_segments)
-
-        return output
-
-    def multi_head_dilated_attention(self, input):
-        # For each head
-        for head in heads:
-            # Offset the query-key-value pairs
-            offset_qkv = offset(head)
-
-            # Perform dilated attention
-            output = self.dilated_attention(offset_qkv)
-
-        # Concatenate the outputs of different heads
-        final_output = concatenate(output)
-
-        return final_output
-
-    def forward(self, input):
-        output = self.multi_head_dilated_attention(input)
-        return output
-```
-
-## Actual Code
-
-Now, let's implement the LongNet model using PyTorch.
-
-```python
-import torch
-import torch.nn as nn
-from torch.nn import MultiheadAttention
-
-class DilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size):
-        super(DilatedAttention, self).__init__()
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.dilation_rate = dilation_rate
-        self.segment_size = segment_size
-        self.attention = MultiheadAttention(embed_dim=d_model, num_heads=num_heads, batch_first=True)
-
-    def forward(self, x):
-        # x: (batch_size, seq_len, d_model); seq_len is assumed to be divisible
-        # by segment_size, and segment_size by dilation_rate
-        batch_size, seq_len, _ = x.shape
-
-        # Split into segments and sparsify each segment along the sequence dimension
-        x = x.view(batch_size, -1, self.segment_size, self.d_model)
-        x = x[:, :, ::self.dilation_rate, :]
-
-        # Fold the segments into the batch dimension, since nn.MultiheadAttention
-        # expects a 3-D (batch, seq, dim) input
-        num_segments, sparse_len = x.size(1), x.size(2)
-        x = x.reshape(batch_size * num_segments, sparse_len, self.d_model)
-
-        # Perform attention within each sparsified segment
-        attn_output, _ = self.attention(x, x, x)
-
-        # Scatter and concatenate the segment outputs
-        attn_output = attn_output.reshape(batch_size, -1, self.d_model)
-        return attn_output
-```
-
-
-
-# Multi-Modal Dilated Attention
-Creating a multi-modal version of DilatedAttention involves extending the attention mechanism to handle input from multiple modalities (e.g., text, audio, video) simultaneously. Each modality is processed by its own dedicated attention mechanism, and the results are then combined in a meaningful way.
-
-Here's an architectural overview, along with requirements, simplifications, optimizations, pseudocode, and an implementation:
-
-## Architectural Overview
-In a multi-modal DilatedAttention, we first apply individual DilatedAttention modules to each modality. The outputs of these modules are then concatenated along the feature dimension, resulting in a tensor that includes attention features from all modalities. Finally, another DilatedAttention module is applied to the concatenated features, allowing cross-modality interactions to be captured.
-
-## Requirements
-1. Individual attention mechanisms for each modality that understand the modality-specific data.
-2. A mechanism to concatenate the modality-specific attention outputs.
-3. A final attention mechanism that captures the cross-modality interactions.
-4. Variable modality support, as not all modalities might be available for every data point.
-
-## Simplifications
-1. All modalities are treated independently until the final concatenation and cross-modality attention step.
-2. Modality-specific attention mechanisms are assumed to be capable of handling their respective data types.
-3. The architecture is flexible with the number and types of modalities. It can work even if one or more modalities are missing.
-
-## Optimizations
-1. Parallel Processing: Since the modality-specific attention computations are independent, they can be performed in parallel, leading to a significant speedup.
-2. Dynamic Computation: If a modality is not available for a certain data point, its computation can be skipped.
-3. Attention Reduction: If the cross-modality attention proves too expensive, you could reduce the number of attention heads or lower the dimensionality of the attention space.
-
-## Pseudocode
-
-```pseudocode
-function MULTIMODAL_DILATED_ATTENTION(input_modality_data):
-    for each modality in input_modality_data:
-        apply DILATED_ATTENTION to modality data
-    concatenate all modality attention outputs
-    apply CROSS_MODALITY_ATTENTION to concatenated outputs
-    return cross_modality_attention_outputs
-end function
-```
-
-## Python Implementation with PyTorch
-I will use the MultiwayNetwork that you've shared as a starting point, which serves as a wrapper that can process different splits of data separately.
- -```python -class MultiModalDilatedAttention(nn.Module): - def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, causal=False, num_modalities=2): - super(MultiModalDilatedAttention, self).__init__() - - self.d_model = d_model - self.num_modalities = num_modalities - self.dilated_attns = nn.ModuleList( - [DilatedAttention(d_model, num_heads, dilation_rate, segment_size, dropout, causal) for _ in range(num_modalities)] - ) - self.cross_modality_attn = DilatedAttention(num_modalities * d_model, num_heads, dilation_rate, segment_size, dropout, causal) - - def forward(self, x): - modality_outputs = [] - for modality_data, attn in zip(x, self.dilated_attns): - modality_outputs.append(attn(modality_data)) - - cross_modality_input = torch.cat(modality_outputs, dim=-1) - cross_modality_output = self.cross_modality_attn(cross_modality_input) - - return cross_modality_output -``` -In this Python implementation, `x` is expected to be a list of tensors, each corresponding to a different modality. The `DilatedAttention` mechanism is applied to each modality independently, and the results are then concatenated and passed through a final `DilatedAttention` mechanism to capture cross-modality interactions. - -Please note that this is a fairly straightforward extension of DilatedAttention to multiple modalities and might require further enhancements to optimally deal with multi-modal data. For instance, attention normalization or scaling might be needed when concatenating modality-specific attention outputs. The choice of the final cross-modality attention mechanism could also be modified as per the needs of your specific application. - - -# Implementing SOTA methods - -Implementing Relative Position Bias and Rotary Position Embedding (XPOS) into the Dilated Attention module can add positional information to the model, enhancing the attention mechanism's ability to understand sequential dependencies. - -1. **Relative Position Bias**: This approach computes relative distances between the positions in the sequence and uses these distances to modify the attention scores. This allows the model to understand and utilize the relative position of tokens in the sequence, which is particularly important in many language tasks. - -2. **Rotary Position Embedding (XPOS)**: This approach is a variant of sinusoidal position embedding that applies a continuous rotation to each token’s embedding. This can be more efficient and flexible compared to standard position embeddings, as it does not require storing separate embeddings for each position. - -Both of these additions provide information about the order of tokens in a sequence, which can be crucial for many tasks. - -However, they add to the complexity of the model, which may have implications for computational cost and memory usage. Also, they may introduce challenges in training, as the model must learn to effectively integrate this positional information. - -**Implementation**: - -Let's integrate the `RelativePositionBias` and `XPOS` into the `DilatedAttention` class. 
-
-```python
-# FlashMHA, XPOS, RelativePositionBias, device, and dtype are assumed to be
-# defined/imported as in the fuller snippet below.
-class DilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, causal=False):
-        super(DilatedAttention, self).__init__()
-        self.d_model = d_model
-        self.num_heads = num_heads
-
-        self.dilation_rate = dilation_rate
-        self.segment_size = segment_size
-
-        self.attention = FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype)
-        self.dropout = nn.Dropout(dropout)
-        self.causal = causal
-
-        self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads)
-        self.xpos = XPOS(head_dim=d_model//num_heads)
-
-    def get_mask(self, i, j):
-        return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2)
-
-    def forward(self, x):
-        batch_size, seq_len, _ = x.shape
-
-        # Apply XPOS
-        x = self.xpos(x)
-
-        # Split and sparsify
-        x = x.view(batch_size, -1, self.segment_size, self.d_model)
-        x = x[:, :, ::self.dilation_rate, :]
-
-        # Perform attention
-        attn_output, _ = self.attention(x, x, x)
-
-        # Apply relative position bias
-        attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1))
-
-        # If causal, create a mask and apply it to the output
-        if self.causal:
-            mask = self.get_mask(attn_output.size(1), attn_output.size(1))
-            attn_output = attn_output.masked_fill(mask, float('-inf'))
-
-        # Apply dropout
-        attn_output = self.dropout(attn_output)
-
-        # Scatter and concatenate
-        attn_output = attn_output.view(batch_size, -1, self.d_model)
-        return attn_output
-```
-
-**New Documentation**:
-
-`DilatedAttention` now includes `RelativePositionBias` and `XPOS` for incorporating positional information.
-
-`RelativePositionBias` adds a bias to the attention scores based on the relative distances between sequence positions. This mechanism is controlled by the `num_buckets`, `max_distance`, and `n_heads` parameters.
-
-`XPOS` applies rotary position embeddings to the input sequence, giving positional context to the model.
-
-Both features can be helpful in tasks where sequence order matters. They are applied automatically in the forward method, but keep in mind that they add to the model's complexity.
-
-Use this model just like the previous version. The input to the forward method is a tensor with shape `(batch_size, seq_len, d_model)`, where `seq_len` is the sequence length and `d_model` is the model dimension. It returns an output tensor with the same shape. The model takes care of applying the `RelativePositionBias` and `XPOS` transformations automatically.
-
-
-Taking into account the multi-head attention specifics and computational complexity from the paper, the `DilatedAttention` class can be updated as follows: each attention head now has a distinct offset when selecting the query, key, and value vectors, and the outputs of the different heads are concatenated into a final output, as described in the paper.
-
-The attention computation complexity estimates from the paper are a good way to theoretically assess the efficiency of the algorithm, but they are not directly incorporated into the code, since they don't affect its functionality. They can, however, serve as a reference when testing and optimizing the algorithm.
-
-Please note that handling the details of distributed training (e.g., splitting input sequences across GPUs and collecting key-value pairs across devices), as described in Section 3 of the paper, would need to be implemented outside of this class, typically at a higher level in the model's training loop.
-
-```python
-import torch
-import torch.nn as nn
-
-# XPOS, RelativePositionBias, and FlashMHA come from this repo's own modules
-from LongNet.utils import XPOS, RelativePositionBias
-from LongNet.attend import FlashMHA
-
-device = "cuda:0"  # Replace this with your correct GPU device
-dtype = torch.float16
-
-class DilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, causal=False, use_xpos=False, use_rel_pos_bias=False):
-        super(DilatedAttention, self).__init__()
-        self.d_model = d_model
-        self.num_heads = num_heads
-
-        self.dilation_rate = dilation_rate
-        self.segment_size = segment_size
-
-        self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(self.dilation_rate)])
-        self.dropout = nn.Dropout(dropout)
-        self.causal = causal
-
-        self.use_xpos = use_xpos
-        self.use_rel_pos_bias = use_rel_pos_bias
-
-        if use_xpos:
-            self.xpos = XPOS(head_dim=d_model//num_heads)
-        if use_rel_pos_bias:
-            self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads)
-
-    def get_mask(self, i, j):
-        return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2)
-
-    def forward(self, x):
-        batch_size, seq_len, _ = x.shape
-
-        if self.use_xpos:
-            x = self.xpos(x)
-
-        # Collect outputs from each attention head
-        all_head_outputs = []
-        for head_idx, attention in enumerate(self.attentions):
-            offset = head_idx % self.dilation_rate
-
-            x_ = x[:, offset::self.dilation_rate, :]  # Apply offset for each head
-            # Fold the segments into the batch dimension; FlashMHA expects (batch, seq, dim)
-            x_ = x_.contiguous().view(-1, self.segment_size, self.d_model)
-
-            # FlashMHA returns the attention output tensor
-            attn_output = attention(x_, x_, x_)
-            if self.use_rel_pos_bias:
-                attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1))
-
-            if self.causal:
-                mask = self.get_mask(attn_output.size(1), attn_output.size(1))
-                attn_output = attn_output.masked_fill(mask, float('-inf'))
-
-            attn_output = self.dropout(attn_output)
-
-            # Resize back to original size
-            attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype)
-            attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model)
-
-            all_head_outputs.append(attn_output_resized)
-
-        # Concatenate the outputs of different heads
-        outputs_concatenated = torch.cat(all_head_outputs, dim=-1)
-
-        return outputs_concatenated
-```
-
-The offsets are now properly handled, creating a different "view" of the input for each attention head. Also, the outputs from each head are concatenated together instead of being summed, which is more in line with the traditional multi-head attention mechanism.
-
-However, there are a few important caveats to keep in mind:
-1. The current implementation assumes the number of attention heads equals the dilation rate. If this is not the case, you will have to adjust the implementation accordingly.
-2. It also assumes that the input sequence length is divisible by the dilation rate, which might not always be the case in practice. In real situations, you would probably need to handle this by padding or truncating the input sequence.
-3. Depending on the specific context, applying dropout to the output of each head individually might not be the best approach. It could be beneficial to apply dropout after concatenating the outputs together.
-4. I've retained the position encoding and relative position bias options, although the paper doesn't seem to mention them. If you're trying to replicate the paper's results exactly, you might want to disable them.
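-
-For reference, here is a minimal usage sketch of the class above. It is an illustration, not part of the original module: it assumes a CUDA device and fp16 inputs (to match the `device` and `dtype` used when constructing `FlashMHA`), and that the sequence length is divisible by `dilation_rate * segment_size`; the hyperparameter values are arbitrary examples.
-
-```python
-# Minimal usage sketch (illustrative values; assumes the DilatedAttention class above)
-import torch
-
-d_model, num_heads = 512, 8            # head_dim = 64: divisible by 8 and <= 128
-dilation_rate, segment_size = 2, 64
-batch_size, seq_len = 2, 1024          # divisible by dilation_rate * segment_size
-
-attn = DilatedAttention(d_model, num_heads, dilation_rate, segment_size)
-x = torch.randn(batch_size, seq_len, d_model, device=device, dtype=dtype)
-
-out = attn(x)
-# Each head's output is resized to (batch, seq_len, d_model) and the heads are
-# concatenated, so the result is (batch, seq_len, dilation_rate * d_model)
-print(out.shape)  # torch.Size([2, 1024, 1024])
-```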
-
-Here's the updated code for a distributed version of the `DilatedAttention` class. I want to emphasize that the distributed training aspect should be handled in your training loop or pipeline, as it involves the overall data- and model-distribution strategy, which is beyond the scope of this single attention module. The provided code is just a simple demonstration of how to collect tensors from the different GPUs around the local attention computation:
-
-```python
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-
-from LongNet.utils import XPOS, RelativePositionBias
-from LongNet.attend import FlashMHA
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-class DistributedDilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, causal=False, use_xpos=False, use_rel_pos_bias=False):
-        super(DistributedDilatedAttention, self).__init__()
-        self.d_model = d_model
-        self.num_heads = num_heads
-
-        self.dilation_rate = dilation_rate
-        self.segment_size = segment_size
-
-        self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device) for _ in range(self.dilation_rate)])
-        self.dropout = nn.Dropout(dropout)
-        self.causal = causal
-
-        self.use_xpos = use_xpos
-        self.use_rel_pos_bias = use_rel_pos_bias
-
-        if use_xpos:
-            self.xpos = XPOS(head_dim=d_model//num_heads)
-        if use_rel_pos_bias:
-            self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads)
-
-    def get_mask(self, i, j):
-        return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2)
-
-    def forward(self, x):
-        batch_size, seq_len, _ = x.shape
-
-        if self.use_xpos:
-            x = self.xpos(x)
-
-        # Collect outputs from each attention head
-        all_head_outputs = []
-        for head_idx, attention in enumerate(self.attentions):
-            offset = head_idx % self.dilation_rate
-
-            x_ = x[:, offset::self.dilation_rate, :]  # Apply offset for each head
-            # Fold the segments into the batch dimension; FlashMHA expects (batch, seq, dim)
-            x_ = x_.contiguous().view(-1, self.segment_size, self.d_model)
-
-            # Compute attention locally, then gather the per-rank outputs.
-            # dist.all_gather takes a list of output tensors (one per rank)
-            # followed by the local tensor to contribute.
-            attn_output = attention(x_, x_, x_)
-            gathered = [torch.empty_like(attn_output) for _ in range(dist.get_world_size())]
-            dist.all_gather(gathered, attn_output)
-
-            if self.use_rel_pos_bias:
-                attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1))
-
-            if self.causal:
-                mask = self.get_mask(attn_output.size(1), attn_output.size(1))
-                attn_output = attn_output.masked_fill(mask, float('-inf'))
-
-            attn_output = self.dropout(attn_output)
-
-            # Resize back to original size
-            attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device)
-            attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model)
-
-            all_head_outputs.append(attn_output_resized)
-
-        # Concatenate the outputs of different heads
-        outputs_concatenated = torch.cat(all_head_outputs, dim=-1)
-
-        return outputs_concatenated
-```
-This code is a simple attempt to implement the distributed strategy described in the paper, and it's far from a complete solution. In a real-world setting, you would likely need to handle many additional complexities. For instance:
-
-- Checking the availability of multiple GPUs and appropriately distributing the computation among them.
-- Efficiently handling GPU memory, to prevent out-of-memory errors when the sequence length is large.
-- Dealing with potential communication overhead when collecting the key-value pairs from different GPUs.
-- Handling edge cases where the sequence length is not perfectly divisible by the number of GPUs or the dilation rate.
-- Integrating with a larger model architecture and training loop, including handling the backward pass and parameter updates.
-- Tuning the performance to fully take advantage of the potential speed-up offered by distributed computing.
-
-Furthermore, the current implementation assumes the use of PyTorch's built-in distributed package. Depending on your specific requirements and computing environment, you might want to use a different package or even write your own custom distributed computing logic.
-
-Therefore, I'd highly recommend carefully studying PyTorch's [official documentation on distributed computing](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) and seeking expert advice before attempting to scale this to a production-level implementation.
-
-
-
-The provided code implements the dilated attention mechanism with multiple heads and different dilation rates. However, it is missing some key components described in the paper:
-
-1. **Mixture of Dilated Attentions:** The paper suggests implementing a mixture of dilated attentions with different segment sizes and dilation rates. This is not implemented in the provided code. The weights for the mixture are calculated dynamically based on the denominator of the attention softmax, which is also not present in the code.
-
-2. **Geometric Sequences for Segment Sizes and Dilation Rates:** The paper suggests setting the segment sizes and dilation rates to geometric sequences for an exponential attentive field. This is not reflected in the provided code.
-
-3. **Distributed Training Algorithm:** The paper describes a distributed training algorithm that allows the model to scale up to 1 billion tokens. This significant feature is not implemented in the provided code.
-
-Here's how we could implement these missing pieces:
-
-1. **Mixture of Dilated Attentions:** We could modify the `forward` method to compute the output for each combination of segment size and dilation rate, and then combine these outputs using dynamic weights. The dynamic weights could be computed by adding a softmax layer that takes the denominator of the attention softmax as input.
-
-2. **Geometric Sequences for Segment Sizes and Dilation Rates:** We could modify the `__init__` method to generate geometric sequences for the segment sizes and dilation rates, for example with the `torch.logspace` function (see the sketch below).
-
-3. **Distributed Training Algorithm:** Implementing this feature would require significant changes to the code. We would need to split the input sequence across multiple devices, compute the attention on each device, and then gather the results. This could be done using PyTorch's distributed computing features, such as `torch.nn.DataParallel` or `torch.nn.parallel.DistributedDataParallel`.
-
-Please note that implementing these features would require a deep understanding of the dilated attention mechanism and the specific requirements of your application. It would also likely require extensive testing and debugging to ensure that the implementation is correct and efficient.
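-
-As a concrete illustration of point 2 above, here is a minimal, hypothetical sketch of generating geometric sequences with `torch.logspace`. The base, starting segment size, and number of scales are illustrative assumptions, not values taken from the paper:
-
-```python
-import torch
-
-num_scales = 4      # number of (segment size, dilation rate) pairs -- an assumption
-w0 = 2048           # starting segment size -- an assumption
-
-# Dilation rates 1, 2, 4, 8, ... via torch.logspace (base-2 exponents 0..num_scales-1)
-dilation_rates = torch.logspace(0, num_scales - 1, steps=num_scales, base=2).long()
-
-# Matching segment sizes w0, 2*w0, 4*w0, ... give the exponential attentive field
-segment_sizes = (w0 * dilation_rates).tolist()
-
-print(dilation_rates.tolist(), segment_sizes)  # [1, 2, 4, 8] [2048, 4096, 8192, 16384]
-```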
- diff --git a/LongNet/__init__.py b/LongNet/__init__.py deleted file mode 100644 index 6be512f..0000000 --- a/LongNet/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from LongNet.attention import DilatedAttention -from LongNet.model import LongNetTokenizer, LongNet, DecoderConfig, Decoder -from LongNet.iterations.DilatedAttentionOP import DilatedAttentionOP -from LongNet.iterations.DynamicDilatedAttention import DynamicDilatedAttention \ No newline at end of file diff --git a/LongNet/attend.py b/LongNet/attend.py deleted file mode 100644 index 6d43253..0000000 --- a/LongNet/attend.py +++ /dev/null @@ -1,237 +0,0 @@ -from torch._C import dtype -# !pip install torch -# !pip install einops - -import math -from collections import namedtuple -from functools import wraps -from packaging import version - -import torch -from torch import nn, einsum, Tensor -import torch.nn.functional as F - -from einops import rearrange - -from dataclasses import dataclass - -# constants - -EfficientAttentionConfig = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient']) - -# helpers - -def exists(val): - return val is not None - -def once(fn): - called = False - @wraps(fn) - def inner(x): - nonlocal called - if called: - return - called = True - return fn(x) - return inner - -print_once = once(print) - -# main class - - -@dataclass -class Intermediates: - qk_similarities: Tensor = None - pre_softmax_attn: Tensor = None - post_softmax_attn: Tensor = None - - def to_tuple(self): - return (self.qk_similarities, self.pre_softmax_attn, self.post_softmax_attn) - -# helpers - - -class FlashAttention(nn.Module): - def __init__( - self, - causal = False, - dropout = 0., - flash = True - ): - super().__init__() - - self.dropout = dropout - self.attn_dropout = nn.Dropout(dropout) - - self.causal = causal - self.flash = flash - assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above' - - # determine efficient attention configs for cuda and cpu - - self.cpu_config = EfficientAttentionConfig(True, True, True) - self.cuda_config = None - - if not torch.cuda.is_available() or not flash: - return - - device_properties = torch.cuda.get_device_properties(torch.device('cuda')) - - if device_properties.major == 8 and device_properties.minor == 0: - print_once('A100 GPU detected, using flash attention if input tensor is on cuda') - self.cuda_config = EfficientAttentionConfig(True, False, False) - else: - print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda') - self.cuda_config = EfficientAttentionConfig(False, True, True) - - def get_mask(self, i, j, device): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 1) - - - def flash_attn( - self, - q, k, v, - mask = None, - attn_bias = None - ): - batch, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device - - # Recommended for multi-query single-key-value attention by Tri Dao - # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64]) - - if k.ndim == 3: - k = rearrange(k, 'b ... -> b 1 ...').expand_as(q) - - if v.ndim == 3: - v = rearrange(v, 'b ... 
-> b 1 ...').expand_as(q) - - # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention - # Check if mask exists and expand to compatible shape - # The mask is B L, so it would have to be expanded to B H N L - - causal = self.causal - - if exists(mask): - assert mask.ndim == 4 - mask = mask.expand(batch, heads, q_len, k_len) - - # manually handle causal mask, if another mask was given - - if causal: - causal_mask = self.create_causal_mask(q_len, k_len, device = device) - mask = mask & ~causal_mask - causal = False - - # handle alibi positional bias - # convert from bool to float - - if exists(attn_bias): - attn_bias = rearrange(attn_bias, 'h i j -> 1 h i j').expand(batch, -1, -1, -1) - - # if mask given, the mask would already contain the causal mask from above logic - # otherwise, if no mask given but still causal, mask out alibi positional bias to a large negative number - - mask_value = -torch.finfo(q.dtype).max - - if exists(mask): - attn_bias = attn_bias.masked_fill(~mask, mask_value // 2) - elif causal: - causal_mask = self.create_causal_mask(q_len, k_len, device = device) - attn_bias = attn_bias.masked_fill(causal_mask, mask_value // 2) - causal = False - - # scaled_dot_product_attention handles attn_mask either as bool or additive bias - # make it an additive bias here - - mask = attn_bias - - # Check if there is a compatible device for flash attention - - config = self.cuda_config if is_cuda else self.cpu_config - - # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale - - with torch.backends.cuda.sdp_kernel(**config._asdict()): - out = F.scaled_dot_product_attention( - q, k, v, - attn_mask = mask, - dropout_p = self.dropout if self.training else 0., - is_causal = causal - ) - - return out - - def forward(self, q, k, v, mask = None, attn_bias = None): - """ - einstein notation - b - batch - h - heads - n, i, j - sequence length (base sequence length, source, target) - d - feature dimension - """ - - q_len, k_len, device = q.shape[-2], k.shape[-2], q.device - - scale = q.shape[-1] ** -0.5 - - kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d' - - if self.flash: - return self.flash_attn(q, k, v, mask = mask, attn_bias = attn_bias) - - # similarity - - sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale - - # attention bias - - if exists(attn_bias): - sim = sim + attn_bias - - # causal mask - - if self.causal: - causal_mask = self.get_mask(q_len, k_len, device) - sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max) - - # attention - - attn = sim.softmax(dim=-1) - attn = self.attn_dropout(attn) - - # aggregate values - - out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v) - - return out - -import torch -from collections import namedtuple -from einops import rearrange - -EfficientAttentionConfig = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient']) - -class FlashMHA(nn.Module): - def __init__(self, embed_dim, num_heads, bias=True, batch_first=True, dropout=0.0, - causal=False, device=None, dtype=None) -> None: - assert batch_first - factory_kwargs = {'device': device, 'dtype': dtype} - super().__init__() - self.embed_dim = embed_dim - self.causal = causal - - self.num_heads = num_heads - assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" - self.head_dim = self.embed_dim // num_heads - assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" - - self.Wqkv = 
nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
-        self.inner_attn = FlashAttention(dropout=dropout, causal=causal)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
-
-    def forward(self, query, key, value):
-        qkv = self.Wqkv(query)
-        q, k, v = rearrange(qkv, 'b s (three h d) -> three b s h d', three=3, h=self.num_heads, d=self.head_dim).unbind(dim=0)
-        context = self.inner_attn(q, k, v)
-        return self.out_proj(rearrange(context, 'b s h d -> b s (h d)'))
diff --git a/LongNet/attention.py b/LongNet/attention.py
deleted file mode 100644
index abfa80f..0000000
--- a/LongNet/attention.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.parallel import DataParallel
-
-from LongNet.utils import XPOS, RelativePositionBias
-
-from LongNet.attend import FlashMHA
-
-device = "cuda:0"
-dtype=torch.float16
-
-
-
-# Define the attention module
-class DilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, causal=False, use_xpos=False, use_rel_pos_bias=False, Distributed=False):
-        super(DilatedAttention, self).__init__()
-
-        # Check parameter types
-        assert isinstance(d_model, int) and d_model > 0, 'd_model should be a positive integer'
-        assert isinstance(num_heads, int) and num_heads > 0, 'num_heads should be a positive integer'
-        assert isinstance(dilation_rate, int) and dilation_rate > 0, 'dilation_rate should be a positive integer'
-
-        assert isinstance(segment_size, int) and segment_size > 0, 'segment_size should be a positive integer'
-        assert isinstance(dropout, float) and 0.0 <= dropout <= 1.0, 'dropout should be a float between 0.0 and 1.0'
-        assert isinstance(causal, bool), 'causal should be a boolean value'
-
-        assert isinstance(use_xpos, bool), 'use_xpos should be a boolean value'
-        assert isinstance(use_rel_pos_bias, bool), 'use_rel_pos_bias should be a boolean value'
-
-        # Initialize parameters
-        self.d_model = d_model  # model dimension
-        self.num_heads = num_heads  # number of attention heads
-        self.dilation_rate = dilation_rate  # dilation rate
-        self.segment_size = segment_size  # segment size
-
-        self.dropout = nn.Dropout(dropout)
-        # Whether causal attention is used
-        self.causal = causal
-        # Whether positional encoding is used
-        self.use_xpos = use_xpos
-        # Whether relative positional bias is used
-        self.use_rel_pos_bias = use_rel_pos_bias
-        self.distributed = Distributed
-
-        # Initialize attention for each head with dilation
-        # Initialize the attention heads with or without DataParallel based on the value of 'distributed'
-        if self.distributed:
-            self.attentions = nn.ModuleList([DataParallel(FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype)) for _ in range(self.dilation_rate)])
-        else:
-            self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(self.dilation_rate)])
-
-
-        # If using positional encoding, initialize it
-        if use_xpos:
-            self.xpos = XPOS(head_dim=d_model//num_heads)
-
-        # If using relative positional bias, initialize it
-        if use_rel_pos_bias:
-            self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads)
-
-        # Initialize softmax for later use in weights
-        self.softmax = nn.Softmax(dim=-1)
-
-    # Function to get mask for causal attention
-    def get_mask(self, i, j):
-        return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2)
-
-    # Forward function
-    def forward(self, x):
-        # Get batch size,
sequence length and model dimension
-        batch_size, seq_len, _ = x.shape
-
-        # If using positional encoding, add it
-        if self.use_xpos:
-            x = self.xpos(x)
-
-        # Initialize list to store outputs from each attention head
-        all_head_outputs = []
-
-        # For each attention head
-        for head_idx, attention in enumerate(self.attentions):
-            # Calculate offset for this head
-            offset = head_idx % self.dilation_rate
-
-            # Apply offset and segment for this head
-            x_ = x[:, offset::self.dilation_rate, :]
-            x_ = x_.contiguous().view(batch_size, -1, self.segment_size, self.d_model)
-
-            # Attend within each batch element; FlashMHA treats the segment axis
-            # of each element as its batch axis here
-            elements_attns = [attention(element.to(dtype), element.to(dtype), element.to(dtype)) for element in x_]
-            attn_output = torch.cat(elements_attns, dim=1)
-
-            # If using relative positional bias, add it
-            if self.use_rel_pos_bias:
-                attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1))
-
-            # If using causal attention, apply the mask
-            if self.causal:
-                mask = self.get_mask(attn_output.size(1), attn_output.size(1))
-                attn_output = attn_output.masked_fill(mask, float('-inf'))
-
-            # Apply dropout
-            attn_output = self.dropout(attn_output)
-
-            # Resize back to original size
-            attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype)
-            attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model)
-
-            # Append output to list of all outputs
-            all_head_outputs.append(attn_output_resized)
-
-        # Calculate the weights for the different dilated attentions
-        weights = self.softmax(torch.tensor([1.0 / self.dilation_rate for _ in range(self.dilation_rate)], device=device, dtype=dtype))
-
-        # Apply the weights to the outputs of the different heads
-        outputs_weighted = sum(w * out for w, out in zip(weights, all_head_outputs))
-
-        # Return the weighted outputs
-        return outputs_weighted
-
-
-
-
-
-
-
-class LongNetTransformer(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rates, segment_sizes):
-        super(LongNetTransformer, self).__init__()
-        assert len(dilation_rates) == len(segment_sizes), "dilation_rates and segment_sizes should have the same length"
-
-
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.dilation_rates = dilation_rates
-        self.segment_sizes = segment_sizes
-
-        # One dilated attention layer per (dilation_rate, segment_size) pair
-        self.dilated_attention_layers = nn.ModuleList(
-            [DilatedAttention(d_model, num_heads, dilation_rate, segment_size)
-             for dilation_rate, segment_size in zip(dilation_rates, segment_sizes)]
-        )
-
-    def forward(self, x):
-        # Accumulate outputs from the different layers
-        outputs = []
-
-        # Process each dilated attention layer
-        for i in range(len(self.dilated_attention_layers)):
-            output = self.dilated_attention_layers[i](x)
-            outputs.append(output)
-
-        # Combine the outputs
-        output = torch.sum(torch.stack(outputs), dim=0)
-
-        return output
-
\ No newline at end of file
diff --git a/LongNet/iterations/BlocksparseDilatedAttention.py b/LongNet/iterations/BlocksparseDilatedAttention.py
deleted file mode 100644
index de53f4d..0000000
--- a/LongNet/iterations/BlocksparseDilatedAttention.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from LongNet.utils import XPOS, RelativePositionBias
-from LongNet.attention import FlashMHA
-
-# Replace this with your correct GPU device
-device = "cuda:0"
-dtype=torch.float16
-
-
-
-
-
-# from flash_attn.flash_blocksparse_attention import FlashBlocksparseMHA
-# perhaps integrate dynamic sparse attention
-class
BlocksparseDilatedAttention(nn.Module): - def __init__(self, d_model, num_heads, dilation_rate, segment_size, sparsity_config, dropout=0.0, causal=False, use_xpos=False, use_rel_pos_bias=False): - super(BlocksparseDilatedAttention, self).__init__() - self.d_model = d_model - self.num_heads = num_heads - - self.dilation_rate = dilation_rate - self.segment_size = segment_size - - self.attentions = nn.ModuleList([FlashBlocksparseMHA(embed_dim=d_model, num_heads=num_heads, sparsity_config=sparsity_config, device=device, dtype=dtype) for _ in range(self.dilation_rate)]) - self.dropout = nn.Dropout(dropout) - self.causal = causal - - self.use_xpos = use_xpos - self.use_rel_pos_bias = use_rel_pos_bias - - if use_xpos: - self.xpos = XPOS(head_dim=d_model//num_heads) - if use_rel_pos_bias: - self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads) - - def get_mask(self, i, j): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2) - - def forward(self, x): - batch_size, seq_len, _ = x.shape - - if self.use_xpos: - x = self.xpos(x) - - # Collect outputs from each attention head - all_head_outputs = [] - for head_idx, attention in enumerate(self.attentions): - offset = head_idx % self.dilation_rate - - x_ = x[:, offset::self.dilation_rate, :] # Apply offset for each head - x_ = x_.contiguous().view(batch_size, -1, self.segment_size, self.d_model) - - attn_output, _ = attention(x_, x_, x_) - if self.use_rel_pos_bias: - attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1)) - - if self.causal: - mask = self.get_mask(attn_output.size(1), attn_output.size(1)) - attn_output = attn_output.masked_fill(mask, float('-inf')) - - attn_output = self.dropout(attn_output) - - # Resize back to original size - attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype) - attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model) - - all_head_outputs.append(attn_output_resized) - - # Concatenate the outputs of different heads - outputs_concatenated = torch.cat(all_head_outputs, dim=-1) - - return outputs_concatenated - - - diff --git a/LongNet/iterations/DilatedAttentionOP.py b/LongNet/iterations/DilatedAttentionOP.py deleted file mode 100644 index 35e0b61..0000000 --- a/LongNet/iterations/DilatedAttentionOP.py +++ /dev/null @@ -1,90 +0,0 @@ - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from LongNet.utils import XPOS, RelativePositionBias -from LongNet.attention import FlashMHA - -# Replace this with your correct GPU device -device = "cuda:0" -dtype=torch.float16 - - - - - -class DilatedAttentionOP(nn.Module): - def __init__(self, d_model, num_heads, dilation_rates, segment_sizes, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False): - super(DilatedAttentionOP, self).__init__() - self.d_model = d_model - self.num_heads = num_heads - - self.dilation_rates = dilation_rates - self.segment_sizes = segment_sizes - - self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(len(dilation_rates))]) - self.dropout = nn.Dropout(dropout) - self.casual = casual - - self.use_xpos = use_xpos - self.use_rel_pos_bias = use_rel_pos_bias - - if use_xpos: - self.xpos = XPOS(head_dim=d_model//num_heads) - if use_rel_pos_bias: - self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads) - - self.softmax = 
nn.Softmax(dim=-1) - - def get_mask(self, i, j): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2) - - def forward(self, x): - batch_size, seq_len, _ = x.shape - - if self.use_xpos: - x = self.xpos(x) - - #collect outputs from each attention head - all_head_outputs = [] - for head_idx, attention in enumerate(self.attentions): - dilation_rate = self.dilation_rates[head_idx] - segment_size = self.segment_sizes[head_idx] - - for offset in range(dilation_rate): - x_ = x[:, offset::dilation_rate, :] # Apply offset for each head - x_ = x_.contiguous().view(batch_size, -1, segment_size, self.d_model) - - elements_attns = [] - - for idx in range(x_.shape[1]): - element = x_[:, idx, :, :].to(dtype) - element_attn, _ = attention(element, element, element) - - elements_attns.append(element_attn) - - attn_output = torch.cat(elements_attns, dim=1) - - if self.use_rel_pos_bias: - attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1)) - - if self.casual: # TODO: Look into it - mask = self.get_mask(attn_output.size(1), attn_output.size(1)) - attn_output = attn_output.masked_fill(mask, float('-inf')) - - attn_output = self.dropout(attn_output) - - #resize back to original size - attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype) - attn_output_resized[:, offset::dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model) - - all_head_outputs.append(attn_output_resized) - - #calculate the weights for the different dilated attentions - weights = self.softmax(torch.tensor([1.0 / len(self.dilation_rates) for _ in range(len(self.dilation_rates))], device=device, dtype=dtype)) - - #apply the weights to the outputs of the different heads - outputs_weighted = sum(w * out for w, out in zip(weights, all_head_outputs)) - - return outputs_weighted \ No newline at end of file diff --git a/LongNet/iterations/DilatedAttentionOld.py b/LongNet/iterations/DilatedAttentionOld.py deleted file mode 100644 index 3be730f..0000000 --- a/LongNet/iterations/DilatedAttentionOld.py +++ /dev/null @@ -1,93 +0,0 @@ - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from LongNet.utils import XPOS, RelativePositionBias -from LongNet.attention import FlashMHA - -# Replace this with your correct GPU device -device = "cuda:0" -dtype=torch.float16 - - - - -#add alibi, qk layer norm, one write head, multihway, -class DilatedAttentionold(nn.Module): - """ - Dilated Attention Module. - - Arguments: - d_model: The dimension of the attention layers. - num_heads: The number of attention heads. - dilation_rate: The dilation rate for dilated attention. - segment_size: The segment size for dilated attention. - dropout (optional): The dropout probability. Default: 0.0 - casual (optional): If set to True, the attention mechanism is casual. Default: False - use_xpos (optional): If set to True, xpos is used for positional encoding. Default: False - use_rel_pos_bias (optional): If set to True, relative position bias is used in the attention mechanism. Default: False - - Usage: - The `DilatedAttention` class can be used as a module for neural networks and is especially suited for transformer architectures. - - Example: - attention = DilatedAttention(d_model=512, num_heads=8, dilation_rate=2, segment_size=64, use_xpos=True, use_rel_pos_bias=True) - output = attention(input_tensor) - - This will return the output tensor after applying dilated attention. 
The `use_xpos` and `use_rel_pos_bias` parameters allow for switching on positional encoding and relative positional bias respectively. - """ - def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False): - super(DilatedAttentionold, self).__init__() - self.d_model = d_model - self.num_heads = num_heads - - self.dilation_rate = dilation_rate - self.segment_size = segment_size - - self.attention = FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) - self.dropout = nn.Dropout(dropout) - self.casual = casual - - self.use_xpos = use_xpos - self.use_rel_pos_bias = use_rel_pos_bias - - if use_xpos: - self.xpos = XPOS(head_dim=d_model//num_heads) - if use_rel_pos_bias: - self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads) - - #head offsets - self.head_offsets = nn.Parameter(torch.randn(num_heads, d_model)) - - def get_mask(self, i, j): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2) - - def forward(self, x): - batch_size, seq_len, _ = x.shape - - if self.use_xpos: - x = self.xpos(x) - - # Split and sparsify - x = x.view(batch_size, -1, self.segment_size, self.d_model) - x = x[:, :, :: self.dilation_rate, :] - - # Perform attention - attn_output, _ = self.attention(x, x, x) - - #if use rel pos => apply relative positioning bias - if self.use_rel_pos_bias: - attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1)) - - # if casual create a mask and apply to the output - if self.casual: - mask = self.get_mask(attn_output.size(1), attn_output.size(1)) - attn_output = attn_output.masked_fill(mask, float('-inf')) - - # apply dropout - attn_output = self.dropout(attn_output) - - # Scatter and concatenate - attn_output = attn_output.view(batch_size, -1, self.d_model) - return attn_output diff --git a/LongNet/iterations/DistributedDilatedAttention.py b/LongNet/iterations/DistributedDilatedAttention.py deleted file mode 100644 index eadbe5e..0000000 --- a/LongNet/iterations/DistributedDilatedAttention.py +++ /dev/null @@ -1,77 +0,0 @@ -#distributed dilated attention based on second iteration -import torch.distributed as dist -import torch -import torch.nn as nn -import torch.nn.functional as F - -from LongNet.utils import XPOS, RelativePositionBias -from LongNet.attention import FlashMHA - -# Replace this with your correct GPU device -device = "cuda:0" -dtype=torch.float16 - - - - -class DistributedDilatedAttention(nn.Module): - def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False): - super(DistributedDilatedAttention, self).__init__() - self.d_model = d_model - self.num_heads = num_heads - - self.dilation_rate = dilation_rate - self.segment_size = segment_size - - self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device) for _ in range(self.dilation_rate)]) - self.dropout = nn.Dropout(dropout) - self.casual = casual - - self.use_xpos = use_xpos - self.use_rel_pos_bias = use_rel_pos_bias - - if use_xpos: - self.xpos = XPOS(head_dim=d_model//num_heads) - if use_rel_pos_bias: - self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads) - - def get_mask(self, i, j): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2) - - def forward(self, x): - batch_size, seq_len, _ = x.shape - - if self.use_xpos: - x = self.xpos(x) - - # Collect 
outputs from each attention head - all_head_outputs = [] - for head_idx, attention in enumerate(self.attentions): - offset = head_idx % self.dilation_rate - - x_ = x[:, offset::self.dilation_rate, :] # Apply offset for each head - x_ = x_.contiguous().view(batch_size, -1, self.segment_size, self.d_model) - - # compute attention locally, gather the key-value pairs before computing the attention - attn_output, _ = attention(x_, x_, x_) - dist.all_gather(attn_output, attn_output) - - if self.use_rel_pos_bias: - attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1)) - - if self.casual: - mask = self.get_mask(attn_output.size(1), attn_output.size(1)) - attn_output = attn_output.masked_fill(mask, float('-inf')) - - attn_output = self.dropout(attn_output) - - # Resize back to original size - attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device) - attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model) - - all_head_outputs.append(attn_output_resized) - - # Concatenate the outputs of different heads - outputs_concatenated = torch.cat(all_head_outputs, dim=-1) - - return outputs_concatenated diff --git a/LongNet/iterations/DynamicDilatedAttention.py b/LongNet/iterations/DynamicDilatedAttention.py deleted file mode 100644 index c9bb41c..0000000 --- a/LongNet/iterations/DynamicDilatedAttention.py +++ /dev/null @@ -1,87 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from LongNet.utils import XPOS, RelativePositionBias -from LongNet.attention import FlashMHA - -# Replace this with your correct GPU device -device = "cuda:0" -dtype=torch.float16 - - - - -class DynamicDilatedAttention(nn.Module): - def __init__(self, d_model, num_heads, num_rates, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False): - super(DynamicDilatedAttention, self).__init__() - self.d_model = d_model - self.num_heads = num_heads - - # Generate geometric sequences for dilation rates and segment sizes - self.dilation_rates = torch.logspace(start=0, end=num_rates-1, steps=num_rates, base=2, dtype=torch.int, device=device) - self.segment_sizes = torch.logspace(start=0, end=num_rates-1, steps=num_rates, base=2, dtype=torch.int, device=device) - - self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(num_rates)]) - self.dropout = nn.Dropout(dropout) - self.casual = casual - - self.use_xpos = use_xpos - self.use_rel_pos_bias = use_rel_pos_bias - - if use_xpos: - self.xpos = XPOS(head_dim=d_model//num_heads) - if use_rel_pos_bias: - self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads) - - self.softmax = nn.Softmax(dim=-1) - - def get_mask(self, i, j): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2) - - def forward(self, x): - batch_size, seq_len, _ = x.shape - - if self.use_xpos: - x = self.xpos(x) - - #collect outputs from each attention head - all_head_outputs = [] - all_softmax_denominators = [] - for head_idx, attention in enumerate(self.attentions): - dilation_rate = self.dilation_rates[head_idx] - segment_size = self.segment_sizes[head_idx] - - for offset in range(dilation_rate): - x_ = x[:, offset::dilation_rate, :] # Apply offset for each head - x_ = x_.contiguous().view(batch_size, -1, segment_size, self.d_model) - - attn_output, attn_weights = attention(x_, x_, x_) - if self.use_rel_pos_bias: - attn_output += 
self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1)) - - if self.casual: - mask = self.get_mask(attn_output.size(1), attn_output.size(1)) - attn_output = attn_output.masked_fill(mask, float('-inf')) - - attn_output = self.dropout(attn_output) - - #resize back to original size - attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype) - attn_output_resized[:, offset::dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model) - - all_head_outputs.append(attn_output_resized) - all_softmax_denominators.append(attn_weights.sum(dim=-1)) - - #calculate the weights for the different dilated attentions - weights = self.softmax(torch.stack(all_softmax_denominators, dim=-1)) - - #apply the weights to the outputs of the different heads - outputs_weighted = sum(w.unsqueeze(-1) * out for w, out in zip(weights, all_head_outputs)) - - return outputs_weighted - - - - - diff --git a/LongNet/iterations/MultiModal.py b/LongNet/iterations/MultiModal.py deleted file mode 100644 index fb3d66b..0000000 --- a/LongNet/iterations/MultiModal.py +++ /dev/null @@ -1,99 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from LongNet.utils import XPOS, RelativePositionBias -from LongNet.attention import FlashMHA - -device = "cuda:0" -dtype=torch.float16 - - -#second iteration the weighted sum of the different dilated + offsets for the different heads -class DilatedAttention(nn.Module): - def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False): - super(DilatedAttention, self).__init__() - self.d_model = d_model - self.num_heads = num_heads - - self.dilation_rate = dilation_rate - self.segment_size = segment_size - - self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(self.dilation_rate)]) - self.dropout = nn.Dropout(dropout) - self.casual = casual - - self.use_xpos = use_xpos - self.use_rel_pos_bias = use_rel_pos_bias - - if use_xpos: - self.xpos = XPOS(head_dim=d_model//num_heads) - if use_rel_pos_bias: - self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads) - - self.softmax = nn.Softmax(dim=-1) - - def get_mask(self, i, j): - return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2) - - def forward(self, x): - batch_size, seq_len, _ = x.shape - - if self.use_xpos: - x = self.xpos(x) - - #collect outputs from each attention head - all_head_outputs = [] - for head_idx, attention in enumerate(self.attentions): - offset = head_idx % self.dilation_rate - - x_ = x[:, offset::self.dilation_rate, :] # Apply offset for each head - x_ = x_.contiguous().view(batch_size, -1, self.segment_size, self.d_model) - - attn_output, _ = attention(x_, x_, x_) - if self.use_rel_pos_bias: - attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1)) - - if self.casual: - mask = self.get_mask(attn_output.size(1), attn_output.size(1)) - attn_output = attn_output.masked_fill(mask, float('-inf')) - - attn_output = self.dropout(attn_output) - - #resize back to original size - attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype) - attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model) - - all_head_outputs.append(attn_output_resized) - - #calculate the weights for the different dilated attentions - 
weights = self.softmax(torch.tensor([1.0 / self.dilation_rate for _ in range(self.dilation_rate)], device=device, dtype=dtype))
-
-        #apply the weights to the outputs of the different heads
-        outputs_weighted = sum(w * out for w, out in zip(weights, all_head_outputs))
-
-        return outputs_weighted
-
-
-
-
-class MultiModalDilationAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, casual=False, num_modalities=2):
-        super(MultiModalDilationAttention, self).__init__()
-
-        self.d_model = d_model
-        self.num_modalities = num_modalities
-        self.dilated_attns = nn.ModuleList(
-            [DilatedAttention(d_model, num_heads, dilation_rate, segment_size, dropout, casual) for _ in range(num_modalities)]
-        )
-        self.cross_modality_attn = DilatedAttention(num_modalities * d_model, num_heads, dilation_rate, segment_size, dropout, casual)
-
-    def forward(self, x):
-        modality_outputs = []
-        for modality_data, attn in zip(x, self.dilated_attns):
-            modality_outputs.append(attn(modality_data))
-
-        cross_modality_input = torch.cat(modality_outputs, dim=-1)
-        cross_modality_output = self.cross_modality_attn(cross_modality_input)
-
-        return cross_modality_output
\ No newline at end of file
diff --git a/LongNet/iterations/sparse/README.md b/LongNet/iterations/sparse/README.md
deleted file mode 100644
index 308e8b8..0000000
--- a/LongNet/iterations/sparse/README.md
+++ /dev/null
@@ -1,80 +0,0 @@
-# Sparsifying even further
-Making the attention mechanism more sparse could potentially improve the model's performance on long sequences, since it reduces the computational complexity. Here are three possible approaches:
-
-**1. Top-k Attention:**
-For each query, instead of calculating attention scores with all key-value pairs, we can select the top-k scoring key-value pairs to calculate the attention output.
-
-Pseudocode:
-```
-for each query in Q:
-    calculate attention scores with all keys in K
-    select top-k scoring keys and their corresponding values
-    calculate the attention output only with the selected top-k key-value pairs
-```
-
-**2. Block-based Attention:**
-Split the input into several blocks; for each query in a block, we only calculate attention scores with keys in the same block or neighboring blocks.
-
-Pseudocode:
-```
-split input into blocks of size b
-for each query in block_i:
-    calculate attention scores with keys in block_i, block_{i-1} and block_{i+1}
-    calculate the attention output
-```
-
-**3. Locality-sensitive hashing (LSH) based Attention:**
-LSH can reduce the complexity of the attention mechanism from quadratic to linear. It works by hashing the queries and keys into several buckets, and for each query, it only needs to calculate attention scores with keys in the same bucket.
-
-Pseudocode:
-```
-hash queries and keys into several buckets using LSH
-for each query in bucket_i:
-    calculate attention scores with keys in the same bucket
-    calculate the attention output
-```
-
-Now, here are the implementations of the above methods. Please note that to use these in your existing `DilatedAttention` model, you may need to integrate these into your `forward` method.
-
-**1. 
Top-k Attention:** -```python -class TopKAttention(nn.Module): - def __init__(self, k): - super(TopKAttention, self).__init__() - self.k = k - - def forward(self, Q, K, V): - attention_scores = torch.matmul(Q, K.transpose(-2, -1)) # calculate attention scores - top_k_scores, top_k_indices = torch.topk(attention_scores, self.k, dim=-1) # select top-k scores and their indices - top_k_scores = torch.nn.functional.softmax(top_k_scores, dim=-1) # apply softmax to get attention weights - top_k_values = torch.gather(V, -2, top_k_indices) # gather corresponding values - output = torch.matmul(top_k_scores, top_k_values) # calculate attention output - - return output -``` - -**2. Block-based Attention:** -```python -class BlockAttention(nn.Module): - def __init__(self, block_size): - super(BlockAttention, self).__init__() - self.block_size = block_size - - def forward(self, Q, K, V): - num_blocks = Q.size(-2) // self.block_size - output = [] - for i in range(num_blocks): - q = Q[:, i*self.block_size:(i+1)*self.block_size, :] - k = K[:, max(0, i-1)*self.block_size:min(num_blocks, i+2)*self.block_size, :] - v = V[:, max(0, i-1)*self.block_size:min(num_blocks, i+2)*self.block_size, :] - attention_scores = torch.matmul(q, k.transpose(-2, -1)) - attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1) - attention_output = torch.matmul(attention_weights, v) - output.append(attention_output) - - output = torch.cat(output, dim=-2) - return output -``` - -**3. LSH-based Attention:** -Implementing LSH-based attention can be complex as it requires a suitable hashing function. As such, I'd recommend using existing implementations like [LSHAttention](https://huggingface.co/transformers/main_classes/model.html#transformers.LSHSelfAttention) from the Hugging Face's Transformers library. This layer applies locality-sensitive hashing (LSH) to enable long-range sequence attention with linear time and memory complexity. 
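
For orientation before reaching for a library, here is a minimal, self-contained sketch of the bucketing idea (an illustration only, not the Hugging Face layer: the random-hyperplane hash and the `n_buckets`/`chunk_size` parameters are assumptions for demonstration, and it follows Reformer's shared query/key convention):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimpleLSHAttention(nn.Module):
    """Illustrative LSH attention sketch: hash tokens with random hyperplanes,
    sort tokens by bucket, then attend within fixed-size chunks of the sorted
    order. Assumes shared queries/keys (as in Reformer) and that the sequence
    length is divisible by chunk_size."""

    def __init__(self, dim, n_buckets=32, chunk_size=64):
        super().__init__()
        self.chunk_size = chunk_size
        # fixed random hyperplanes acting as the LSH hash (not learned)
        self.register_buffer("hyperplanes", torch.randn(dim, n_buckets))

    def forward(self, qk, v):
        b, n, d = qk.shape
        buckets = (qk @ self.hyperplanes).argmax(dim=-1)  # (b, n) bucket ids
        order = buckets.argsort(dim=-1)                   # sort tokens by bucket
        undo = order.argsort(dim=-1)                      # indices to restore order

        def gather(t, idx):
            return t.gather(1, idx.unsqueeze(-1).expand_as(t))

        sq, sv = gather(qk, order), gather(v, order)
        c = self.chunk_size
        q = sq.view(b, n // c, c, d)
        k, vals = q, sv.view(b, n // c, c, d)             # shared query/key space
        scores = q @ k.transpose(-2, -1) / d ** 0.5
        out = (F.softmax(scores, dim=-1) @ vals).reshape(b, n, d)
        return gather(out, undo)                          # back to original order
```

Because tokens are sorted by bucket before chunking, most keys inside a chunk share the query's bucket, which is what recovers roughly linear cost; a production implementation would add multiple hash rounds, self-masking, and cross-chunk lookback.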
\ No newline at end of file
diff --git a/LongNet/iterations/topk.py b/LongNet/iterations/topk.py
deleted file mode 100644
index 2bd41d4..0000000
--- a/LongNet/iterations/topk.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from LongNet.utils import XPOS, RelativePositionBias
-from LongNet.attention import FlashMHA
-
-
-# Replace this with your correct GPU device
-device = "cuda:0"
-dtype = torch.float16
-
-
-class TopKAttention(nn.Module):
-    def __init__(self, k):
-        super(TopKAttention, self).__init__()
-        self.k = k
-
-    def forward(self, Q, K, V):
-        attention_scores = torch.matmul(Q, K.transpose(-2, -1))  # calculate attention scores
-        top_k_scores, top_k_indices = torch.topk(attention_scores, self.k, dim=-1)  # select top-k scores and their indices
-        top_k_scores = torch.nn.functional.softmax(top_k_scores, dim=-1)  # apply softmax to get attention weights
-        top_k_values = torch.gather(V, -2, top_k_indices)  # gather corresponding values
-        output = torch.matmul(top_k_scores, top_k_values)  # calculate attention output
-        return output
-
-
-# Define the attention module
-class DilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, top_k, segment_size, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False):
-        super(DilatedAttention, self).__init__()
-
-        # Initialize parameters
-        self.d_model = d_model  # model dimension
-        self.num_heads = num_heads  # number of attention heads
-        self.dilation_rate = dilation_rate  # dilation rate
-        self.segment_size = segment_size  # segment size
-        self.top_k_attention = TopKAttention(top_k)
-
-        # Initialize attention for each head with dilation
-        self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(self.dilation_rate)])
-
-        # Initialize dropout layer
-        self.dropout = nn.Dropout(dropout)
-
-        # Whether causal attention is used
-        self.casual = casual
-
-        # Whether to use positional encoding
-        self.use_xpos = use_xpos
-
-        # Whether to use relative positional bias
-        self.use_rel_pos_bias = use_rel_pos_bias
-
-        # If using positional encoding, initialize it
-        if use_xpos:
-            self.xpos = XPOS(head_dim=d_model//num_heads)
-
-        # If using relative positional bias, initialize it
-        if use_rel_pos_bias:
-            self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads)
-
-        # Initialize softmax for later use in weights
-        self.softmax = nn.Softmax(dim=-1)
-
-    # Function to get mask for causal attention
-    def get_mask(self, i, j):
-        return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2)
-
-    # Forward function
-    def forward(self, x):
-        # Get batch size, sequence length and model dimension
-        batch_size, seq_len, _ = x.shape
-
-        # If using positional encoding, add it
-        if self.use_xpos:
-            x = self.xpos(x)
-
-        # Initialize list to store outputs from each attention head
-        all_head_outputs = []
-
-        # For each attention head
-        for head_idx, attention in enumerate(self.attentions):
-            # Calculate offset for this head
-            offset = head_idx % self.dilation_rate
-
-            # Apply offset and segment for this head
-            x_ = x[:, offset::self.dilation_rate, :]
-            x_ = x_.contiguous().view(batch_size, -1, self.segment_size, self.d_model)
-
-            # Pass through attention
-            attn_output, _ = attention(x_, x_, x_)
-
-            # If using relative positional bias, add it
-            if self.use_rel_pos_bias:
-                attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1))
-
-            # If using causal attention, apply mask
-            if 
self.casual:
-                mask = self.get_mask(attn_output.size(1), attn_output.size(1))
-                attn_output = attn_output.masked_fill(mask, float('-inf'))
-
-            # Apply dropout
-            attn_output = self.dropout(attn_output)
-
-            # Resize back to original size
-            attn_output_resized = torch.zeros((batch_size, seq_len, self.d_model), device=device, dtype=dtype)
-            attn_output_resized[:, offset::self.dilation_rate, :] = attn_output.contiguous().view(batch_size, -1, self.d_model)
-
-            # Append output to list of all outputs
-            all_head_outputs.append(attn_output_resized)
-
-        # Calculate the weights for the different dilated attentions
-        weights = self.softmax(torch.tensor([1.0 / self.dilation_rate for _ in range(self.dilation_rate)], device=device, dtype=dtype))
-
-        # Apply the weights to the outputs of the different heads
-        outputs_weighted = sum(w * out for w, out in zip(weights, all_head_outputs))
-
-        # Return the weighted outputs
-        return outputs_weighted
-
-
-
-
-
-
-
-class LongNetTransformer(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rates, segment_sizes, top_k=16):
-        super(LongNetTransformer, self).__init__()
-        assert len(dilation_rates) == len(segment_sizes), "dilation_rates and segment_sizes should have the same length"
-
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.dilation_rates = dilation_rates
-        self.segment_sizes = segment_sizes
-
-        # one DilatedAttention per (dilation_rate, segment_size) pair; this file's
-        # DilatedAttention also requires a top_k argument (default here is arbitrary)
-        self.dilated_attention_layers = nn.ModuleList(
-            [DilatedAttention(d_model, num_heads, dilation_rate, top_k, segment_size)
-             for dilation_rate, segment_size in zip(dilation_rates, segment_sizes)]
-        )
-
-    def forward(self, x):
-        # accumulate outputs from different layers
-        outputs = []
-
-        # process each dilated attention layer
-        for i in range(len(self.dilated_attention_layers)):
-            output = self.dilated_attention_layers[i](x)
-            outputs.append(output)
-
-        # combine the outputs
-        output = torch.sum(torch.stack(outputs), dim=0)
-
-        return output
-
\ No newline at end of file
diff --git a/LongNet/model.py b/LongNet/model.py
deleted file mode 100644
index 4ee34c5..0000000
--- a/LongNet/model.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import torch
-from torch.nn import Embedding, Module
-import bitsandbytes
-
-from transformers import AutoTokenizer
-
-
-from torchscale.architecture.config import DecoderConfig
-from torchscale.architecture.decoder import Decoder
-from torchscale.component.embedding import PositionalEmbedding
-
-
-class LongNetTokenizer:
-    def __init__(self):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            "EleutherAI/gpt-neox-20b",
-            eos_token="",
-            pad_token="",
-            extra_ids=0,
-            model_max_length=8192
-        )
-
-    def tokenize_texts(self, texts):
-        return self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True).input_ids
-
-
-class LongNet(Module):
-    def __init__(self):
-        super().__init__()
-        self.embed = bitsandbytes.nn.modules.Embedding(
-            320002,
-            2048,
-            padding_idx=1
-        )
-
-        self.embed_positions = PositionalEmbedding(
-            2048,
-            2048,
-            1
-        )
-
-        self.output_projection = torch.nn.Linear(
-            2048, 32002, bias=False
-        )
-
-        self.config = DecoderConfig(
-            decoder_layers=24,
-            decoder_embed_dim=2048,
-            decoder_ffn_embed_dim=8192,
-            decoder_attention_heads=32,
-            dropout=0.1,
-            activation_fn="gelu",
-            attention_dropout=0.1,
-            decoder_dilation_rate=4,
-            decoder_segment_size=2,
-            vocab_size=64007,
-        )
-
-        self.decoder = Decoder(
-            self.config,
-            embed_tokens=self.embed,
-            embed_positions=self.embed_positions,
-            output_projection=self.output_projection
-        )
-
-    def forward(self, text_tokens, **kwargs):
-        model_input = 
self.decoder.forward_embedding(text_tokens)[0] - return self.decoder(model_input, passed_x=model_input)[0] - \ No newline at end of file diff --git a/LongNet/torchscale b/LongNet/torchscale deleted file mode 160000 index 2b10135..0000000 --- a/LongNet/torchscale +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2b101355d79dc48b8cdf4bc58a94f98be69f182a diff --git a/LongNet/training.py b/LongNet/training.py deleted file mode 100644 index 31c4bdd..0000000 --- a/LongNet/training.py +++ /dev/null @@ -1,650 +0,0 @@ -import math -import multiprocessing -import os -from datetime import timedelta -from functools import partial -from itertools import chain - -import torch -from torch.distributed.fsdp import ( - FullyShardedDataParallel, - MixedPrecision, - BackwardPrefetch, - ShardingStrategy, -) -from accelerate import Accelerator -from accelerate.utils import (DummyOptim, DummyScheduler, - InitProcessGroupKwargs) -from datasets import concatenate_datasets, load_dataset -from lion_pytorch import Lion -from torch.nn import LayerNorm - -from torch.nn import LayerNorm - -from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper) - -from torch.distributed.fsdp.wrap import ( - transformer_auto_wrap_policy, -) - - -from torch.optim import AdamW -from torch.utils.data import DataLoader -from tqdm import tqdm -from transformers import (AutoTokenizer, default_data_collator, - get_cosine_schedule_with_warmup, - get_linear_schedule_with_warmup, set_seed) - - - -# INTEGRATE LONGNET selector + stable8bitfusedadam - -# from LongNet.torchscale.torchscale.architecture.decoder import Decoder -from torchscale import Decoder -from utils import StableAdamWUnfused -from model import LongNet -############ SETUP CONFIG -# import torch.distributed as dist - -# dist.init_process_group(backend='nccl', init_method="env://") - -################ - -class CFG: - BATCH_SIZE: int = 3 - GRADIENT_ACCUMULATE_EVERY: int = 1 - SEED: int = 42 - LEARNING_RATE: float = 3e-4 - WEIGHT_DECAY: float = 0.1 - SEQ_LEN: int = 8192 - NUM_CPU: int = multiprocessing.cpu_count() - USE_DEEPSPEED: bool = True - USE_FSDP: bool = False - USE_PRETOKENIZED: bool = False - USE_ACTIVATION_CHECKPOINTING: bool = False - RESUME_FROM_CHECKPOINT: str = None - CHECKPOINTING_STEPS: int = 1000 - OUTPUT_DIR: str = "YOUR_OUTPUT_DIR" - ENTITY_NAME: str = "YOUR_ENTITY_NAME" #wandb - - -# helpers - - -def print_num_params(model, accelerator: Accelerator): - n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - accelerator.print(f"Number of parameters in model: {n_params}") - - -# activation checkpointing - - -def activation_checkpointing( - model: torch.nn.Module, - offload_to_cpu: bool = False, - accelerator: Accelerator = None, -): - """ - Apply activation checkpointing to a model. - - Args: - model (Module): The model to which to apply activation checkpointing. - offload_to_cpu (bool, optional): Whether to offload the activations to CPU. Defaults to False. - accelerator (Accelerator, optional): The Accelerate library accelerator. Defaults to None. 
- """ - if accelerator is not None: - accelerator.print(f"Using activation checkpointing") - #maybe error here in decoder, use parallel transformer block - check_fn = lambda submodule: isinstance(submodule, Decoder) - non_reentrant_wrapper = partial( - checkpoint_wrapper, - offload_to_cpu=offload_to_cpu, - checkpoint_impl=CheckpointImpl.NO_REENTRANT, - ) - apply_activation_checkpointing( - model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn - ) - - -# FSDP - - -def fsdp( - model: torch.nn.Module, - auto_wrap: bool = False, - mp: str = "fp32", - shard_strat: str = "NO_SHARD", -): - """ - This function wraps a given PyTorch model with the FullyShardedDataParallel (FSDP) wrapper to enable efficient data parallelism and model sharding. - - Args: - model (torch.nn.Module): The original PyTorch model to be wrapped with FSDP. - auto_wrap (bool, optional): If True, it enables automatic wrapping of the model's layers according to the transformer_auto_wrap_policy. Default is False. - mp (str, optional): The mixed precision mode to be used. Can be 'bf16' for BFloat16, 'fp16' for Float16 or 'fp32' for Float32 precision. Default is 'fp32'. - shard_strat (str, optional): The sharding strategy to be used. Can be 'SHARD_GRAD' for sharding at gradient computation, 'FULL_SHARD' for full model sharding or 'NO_SHARD' for no sharding. Default is 'NO_SHARD'. - - Raises: - ValueError: If the provided mp (mixed precision mode) is not 'bf16', 'fp16' or 'fp32'. - ValueError: If the provided shard_strat (sharding strategy) is not 'SHARD_GRAD', 'FULL_SHARD' or 'NO_SHARD'. - - Returns: - torch.nn.Module: The input model wrapped with FSDP. - """ - if auto_wrap: - LongNet_auto_wrap_policy = partial( - transformer_auto_wrap_policy, - transformer_layer_cls={ - Decoder, - }, - ) - else: - LongNet_auto_wrap_policy = None - - if mp == "bf16": - mp_fsdp = MixedPrecision( - param_dtype=torch.bfloat16, - # Gradient communication precision. - reduce_dtype=torch.bfloat16, - # Buffer precision. - buffer_dtype=torch.bfloat16, - ) - elif mp == "fp16": - mp_fsdp = MixedPrecision( - param_dtype=torch.float16, - # Gradient communication precision. - reduce_dtype=torch.float16, - # Buffer precision. - buffer_dtype=torch.float16, - ) - elif mp == "fp32": - mp_fsdp = MixedPrecision( - param_dtype=torch.float32, - # Gradient communication precision. - reduce_dtype=torch.float32, - # Buffer precision. - buffer_dtype=torch.float32, - ) - else: - raise ValueError( - "Invalid scheduler_type. Expected 'bf16', 'fp16' or 'fp32', got: {}".format( - mp - ) - ) - - if shard_strat == "SHARD_GRAD": - sharding_strat_fsdp = ShardingStrategy.SHARD_GRAD_OP - elif shard_strat == "FULL_SHARD": - sharding_strat_fsdp = ShardingStrategy.FULL_SHARD - elif shard_strat == "NO_SHARD": - sharding_strat_fsdp = ShardingStrategy.NO_SHARD - else: - raise ValueError( - "Invalid scheduler_type. 
Expected 'SHARD_GRAD', 'FULL_SHARD' or 'NO_SHARD', got: {}".format( - shard_strat - ) - ) - - model = FullyShardedDataParallel( - model, - auto_wrap_policy=LongNet_auto_wrap_policy, - mixed_precision=mp_fsdp, - backward_prefetch=BackwardPrefetch.BACKWARD_PRE, - sharding_strategy=sharding_strat_fsdp, - forward_prefetch=True, - use_orig_params=True, - ) - - return model - - -# learning rate scheduler - - -def get_lr_scheduler_with_warmup( - optimizer: torch.optim.Optimizer, - scheduler_type: str, - num_warmup_steps: int, - max_train_steps: int, - grad_accumulate_every: int = 1, - accelerator: Accelerator = None, -): - """ - Get a learning rate scheduler with warmup. - - Args: - optimizer (Optimizer): The optimizer for which to create the learning rate scheduler. - scheduler_type (str): The type of learning rate scheduler to create, either "linear" or "cosine". - num_warmup_steps (int): The number of warmup steps for the learning rate scheduler. - max_train_steps (int): The maximum number of training steps. - grad_accumulate_every (int, optional): The gradient accumulation factor. Defaults to 1. - accelerator (Accelerator, optional): The Accelerate library accelerator. Defaults to None. - - Returns: - The learning rate scheduler with warmup. - - Raises: - ValueError: If scheduler_type is not "linear" or "cosine". - """ - NUM_WARMUP_STEPS = num_warmup_steps - GRADIENT_ACCUMULATE_EVERY = grad_accumulate_every - if accelerator is not None: - accelerator.print(f"Using {scheduler_type} lr scheduler") - if scheduler_type == "linear": - return get_linear_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, - num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY, - ) - elif scheduler_type == "cosine": - return get_cosine_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, - num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY, - ) - else: - raise ValueError( - "Invalid scheduler_type. Expected 'linear' or 'cosine', got: {}".format( - scheduler_type - ) - ) - - -# optimizers - - -def decoupled_optimizer( - model: torch.nn.Module, - learning_rate: float, - weight_decay: float, - beta_1: float, - beta_2: float, - optimizer_type: str, - use_fsdp: bool = True, - accelerator: Accelerator = None, -): - """ - Decouples the optimizer from the training process. - - This function sets up the optimizer for the model by creating two groups of parameters: - one for weight decay and one without weight decay. Then, it initializes the optimizer - with these two groups of parameters. - - Args: - model (Module): The model whose parameters are optimized. - learning_rate (float): The learning rate for the optimizer. - weight_decay (float): The weight decay for the optimizer. - beta_1 (float): The exponential decay rate for the 1st moment estimates. - beta_2 (float): The exponential decay rate for the 2nd moment estimates. - optimizer_type (str): The type of the optimizer. Can be 'lion', 'adamw', or 'stable_adamw'. - use_fsdp (bool, optional): If True, the optimizer will work with fully sharded data parallelism. Defaults to True. - accelerator (Accelerator, optional): The accelerator from HuggingFace's Accelerate library. Defaults to None. - - Returns: - Optimizer: The initialized optimizer. - - Raises: - ValueError: If the optimizer type is not 'lion', 'adamw' or 'stable_adamw'. 
- """ - accelerator.print(f"Using {optimizer_type} optimizer") - # Create an empty dictionary called param_dict to store the model's named parameters. - param_dict = {} - # Iterate over the model's named parameters and populate the param_dict with key-value pairs. - for param_name, param in model.named_parameters(): - param_dict[param_name] = param - - # Separate the model's named modules into two groups: decay and no_decay. - - # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. - no_decay = [] - - if use_fsdp: - exclude_module = "_fsdp_wrapped_module.token_emb" - else: - exclude_module = "token_emb" - - # Iterate through the named modules of the model. - for module_name, module in model.named_modules(): - # Check if the current module is an instance of any of the desired types (LayerNorm or torch.nn.Embedding). - for ndim in [LayerNorm, torch.nn.Embedding]: - if isinstance(module, ndim): - # If torch.nn.Embedding, append its name with a ".weight" suffix to the no_decay list. - if module_name == exclude_module: - no_decay.append(f"{module_name}.weight") - else: - # If the module is an instance of LayerNorm - no_decay.append(f"{module_name}.gamma") - # Exit the inner loop since the desired module has been found. - break - - # Create an empty list to store the names of the Linear layer weights with weight decay. - decay = [] - - # Iterate through the named modules of the model. - for module_name, module in model.named_modules(): - # Check if the current module is an instance of the desired type (torch.nn.Linear). - for ndim in [torch.nn.Linear]: - if isinstance(module, ndim): - # If the module is an instance of torch.nn.Linear, append its name with a ".weight" suffix to the decay list. - decay.append(f"{module_name}.weight") - # Exit the inner loop since the desired module has been found. - break - - # Create two separate lists of model parameters: decay_param and no_decay_param. - # The decay_param list contains the parameters that should have weight decay applied. - # The no_decay_param list contains the parameters that should not have weight decay applied, excluding the 'to_logits.weight' parameter. - - # Create an empty list called decay_param to store the parameters with weight decay. - decay_param = [] - - if use_fsdp: - exclude_param = "_fsdp_wrapped_module.to_logits.weight" - else: - exclude_param = "to_logits.weight" - - # Iterate over the decay list, which contains the names of the parameters with weight decay. - for param in decay: - # Check if the current parameter is not 'to_logits.weight'. - # Append the corresponding parameter from param_dict to the decay_param list. - - if param != exclude_param: - decay_param.append(param_dict[param]) - - # Create an empty list called no_decay_param to store the parameters without weight decay. - no_decay_param = [] - - # Iterate over the no_decay list, which contains the names of the parameters without weight decay. - for param in no_decay: - # Append the corresponding parameter from param_dict to the no_decay_param list. - no_decay_param.append(param_dict[param]) - - # Create a list called grouped_params that contains two dictionaries. - # The first dictionary has the decay_param list and the corresponding weight_decay value. - # The second dictionary has the no_decay_param list and a weight_decay value of 0.0. 
- grouped_params = [ - {"params": decay_param, "weight_decay": weight_decay}, - {"params": no_decay_param, "weight_decay": 0.0}, - ] - - # Create a variable called optimizer that stores an instance of the optimizer. - if optimizer_type == "lion": - optimizer = Lion(grouped_params, lr=learning_rate, betas=(beta_1, beta_2),) - elif optimizer_type == "adamw": - optimizer = AdamW(grouped_params, lr=learning_rate, betas=(beta_1, beta_2),) - elif optimizer_type == "deepspeed": - optimizer = DummyOptim(grouped_params, lr=learning_rate, betas=(beta_1, beta_2),) - elif optimizer_type == "stable_adamw": - optimizer = StableAdamWUnfused( - grouped_params, lr=learning_rate, betas=(beta_1, beta_2), - ) - else: - raise ValueError( - "Invalid optimizer_type. Expected 'lion', 'adamw', 'deepspeed' or 'stable_adamw', got: {}".format( - optimizer_type - ) - ) - - # Return the optimizer. - return optimizer - - -# dataloaders - - -def build_dataloaders(): - """ - Build data loaders for training. - - This function performs the following steps: - 1. Load the tokenizer from the pretrained "EleutherAI/gpt-neox-20b" model. - 2. Load the "openwebtext" dataset. - 3. Tokenize the dataset, adding the end-of-sentence token to each text. - 4. Process the tokenized dataset into chunks of a specified block size. - - Returns: - Dataset: The processed dataset ready for training. - """ - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") - dataset = load_dataset("openwebtext", split="train") - - tokenized_dataset = dataset.map( - lambda example: tokenizer([t + tokenizer.eos_token for t in example["text"]]), - batched=True, - num_proc=CFG.NUM_CPU, - remove_columns=["text"], - ) - - block_size = CFG.SEQ_LEN - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - return result - - train_dataset = tokenized_dataset.map( - group_texts, batched=True, num_proc=CFG.NUM_CPU, - ) - - return train_dataset - -#switch to falconwebdataset -def build_pre_tokenized(): - d0 = load_dataset("conceptofmind/c4_0-to-20_neox_with_eos_8k", split="train") - d1 = load_dataset("conceptofmind/c4_21-to-40_neox_with_eos_8k", split="train") - d2 = load_dataset("conceptofmind/c4_41-to-60_neox_with_eos_8k", split="train") - d3 = load_dataset("conceptofmind/c4_61-to-80_neox_with_eos_8k", split="train") - d4 = load_dataset("conceptofmind/c4_81-to-100_neox_with_eos_8k", split="train") - train_dataset = concatenate_datasets([d0, d1, d2, d3, d4]) - return train_dataset - - - -def Train(): - # accelerator - - timeout = InitProcessGroupKwargs(timeout=timedelta(seconds=1_000_000)) - - accelerator = Accelerator( - gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY, - mixed_precision="fp16", - log_with="wandb", - kwargs_handlers=[timeout], - ) - # AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = 4 #?????? 
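-    # (note: when Accelerate drives DeepSpeed, `train_micro_batch_size_per_gpu`
-    # in the DeepSpeed config must agree with the DataLoader batch size,
-    # i.e. CFG.BATCH_SIZE here; that is what the commented-out line above sets.)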
-
-
-    accelerator.init_trackers(
-        project_name="LongNet",
-        config={
-            "batch_size": CFG.BATCH_SIZE,
-            "gradient_accumulate_every": CFG.GRADIENT_ACCUMULATE_EVERY,
-            "learning_rate": CFG.LEARNING_RATE,
-            "seq_len": CFG.SEQ_LEN,
-        },
-        init_kwargs={"wandb": {"entity": CFG.ENTITY_NAME}},
-    )
-
-    accelerator.print(f"Total GPUs: {accelerator.num_processes}")
-
-    # set seed
-
-    set_seed(CFG.SEED)
-
-
-    # model = LongNet.to(accelerator.device)
-    # model = AutoModelForCausalLM.from_pretrained("YOUR MODEL", load_in_4bit=True, device_map="auto").to(accelerator.device)
-    model = LongNet().to(accelerator.device)
-
-    print_num_params(model, accelerator)
-
-    if CFG.USE_FSDP:
-        model = fsdp(
-            model,
-            mp="fp16",
-            shard_strat="SHARD_GRAD"
-        )
-
-    if CFG.USE_ACTIVATION_CHECKPOINTING:
-        activation_checkpointing(model, accelerator)
-
-    model = accelerator.prepare(model)
-
-    # dataloaders
-
-    if CFG.USE_PRETOKENIZED:
-        train_dataset = build_pre_tokenized()
-    else:
-        train_dataset = build_dataloaders()
-
-    train_loader = DataLoader(
-        train_dataset, batch_size=CFG.BATCH_SIZE, collate_fn=default_data_collator,
-    )
-
-    # optimizer
-
-    optim = decoupled_optimizer(
-        model=model,
-        learning_rate=CFG.LEARNING_RATE,
-        weight_decay=CFG.WEIGHT_DECAY,
-        beta_1=0.90,
-        beta_2=0.95,
-        optimizer_type='deepspeed',
-        use_fsdp=True,
-        accelerator=accelerator
-    )
-
-    # Determine number of training steps
-
-    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
-    accelerator.print(f"Max train steps: {max_train_steps}")
-
-    # lr scheduler
-
-    NUM_WARMUP_STEPS = int(max_train_steps * 0.01)
-    accelerator.print(f"Num warmup steps: {NUM_WARMUP_STEPS}")
-
-    if CFG.USE_DEEPSPEED:
-        lr_scheduler = DummyScheduler(
-            optim,
-            total_num_steps=max_train_steps * accelerator.num_processes,
-            warmup_num_steps=NUM_WARMUP_STEPS
-        )
-    else:
-        lr_scheduler = get_lr_scheduler_with_warmup(
-            optimizer=optim,
-            scheduler_type="cosine",
-            num_warmup_steps=NUM_WARMUP_STEPS,
-            max_train_steps=max_train_steps,
-            grad_accumulate_every=CFG.GRADIENT_ACCUMULATE_EVERY,
-        )
-
-    # prepare
-
-    optim, train_loader, lr_scheduler = accelerator.prepare(
-        optim, train_loader, lr_scheduler
-    )
-
-    # checkpoint scheduler
-
-    accelerator.register_for_checkpointing(lr_scheduler)
-
-    # I do not know why Huggingface recommends recalculation of max_train_steps
-
-    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
-    accelerator.print(f"Max train steps recalculated: {max_train_steps}")
-
-    # Total batch size for logging
-
-    total_batch_size = (
-        CFG.BATCH_SIZE * accelerator.num_processes * CFG.GRADIENT_ACCUMULATE_EVERY
-    )
-    accelerator.print(f"Total batch size: {total_batch_size}")
-
-    # resume training
-
-    progress_bar = tqdm(
-        range(max_train_steps), disable=not accelerator.is_local_main_process
-    )
-    completed_steps = 0
-
-    if CFG.RESUME_FROM_CHECKPOINT:
-        if CFG.RESUME_FROM_CHECKPOINT is not None and CFG.RESUME_FROM_CHECKPOINT != "":
-            accelerator.print(f"Resuming from checkpoint {CFG.RESUME_FROM_CHECKPOINT}")
-            accelerator.load_state(CFG.RESUME_FROM_CHECKPOINT)
-            path = os.path.basename(CFG.RESUME_FROM_CHECKPOINT)
-        training_difference = os.path.splitext(path)[0]
-
-        # need to multiply `gradient_accumulation_steps` to reflect real steps
-        resume_step = (
-            int(training_difference.replace("step_", ""))
-            * CFG.GRADIENT_ACCUMULATE_EVERY
-        )
-
-    if CFG.RESUME_FROM_CHECKPOINT and resume_step is not None:
-        train_loader = accelerator.skip_first_batches(train_loader, resume_step)
-        completed_steps += resume_step
-        progress_bar.update(resume_step)
-
-    # training
-
-    model.train()
-    for step, batch in enumerate(train_loader):
-        with accelerator.accumulate(model):
-            inputs = batch["input_ids"].to(accelerator.device)
-            loss = model(inputs, return_loss=True)
-            accelerator.backward(loss)
-
-            accelerator.log({"loss": loss.item()}, step=step)
-
-            if accelerator.sync_gradients:
-                accelerator.clip_grad_norm_(model.parameters(), 1.0)
-
-            optim.step()
-            lr_scheduler.step()
-            optim.zero_grad()
-
-        if accelerator.sync_gradients:
-            progress_bar.update(1)
-            completed_steps += 1
-
-            if isinstance(CFG.CHECKPOINTING_STEPS, int):
-                if completed_steps % CFG.CHECKPOINTING_STEPS == 0:
-                    output_dir = f"step_{completed_steps}"
-                    if CFG.OUTPUT_DIR is not None:
-                        output_dir = os.path.join(CFG.OUTPUT_DIR, output_dir)
-                    accelerator.save_state(output_dir)
-
-        if completed_steps >= max_train_steps:
-            break
-
-    # end training
-
-    # accelerator.print(f"Training Finished")
-    accelerator.end_training()
-
-    # save final model
-
-    # accelerator.print(f"Saving model to {CFG.OUTPUT_DIR}")
-    if CFG.OUTPUT_DIR is not None:
-        accelerator.wait_for_everyone()
-        unwrapped_model = accelerator.unwrap_model(model)
-        with accelerator.main_process_first():
-            accelerator.save(
-                unwrapped_model.state_dict(), f"{CFG.OUTPUT_DIR}/final/final_model.pt"
-            )
-
-
-if __name__ == "__main__":
-    Train()
\ No newline at end of file
diff --git a/README.md b/README.md
index a45aacf..600601c 100644
--- a/README.md
+++ b/README.md
@@ -1,321 +1,103 @@
-# Agora
-This implementation of LongNet is brought to you by Agora, an all-new open source AI research organization with 1,500+ AI researchers all striving to advance humanity!
+[![Multi-Modality](images/agorabanner.png)](https://discord.gg/qUtxnK2NMf)
-![Agora banner](agora-banner-water.png)
-
-[Join us and help contribute to LongNet and/or receive FAST support in the Agora discord!](https://discord.gg/qUtxnK2NMf)
# LongNet: Scaling Transformers to 1,000,000,000 Tokens
+![LongNetBanner](images/longnet.jpg)
-This is an open source implementation for the paper [LongNet: Scaling Transformers to 1,000,000,000 Tokens](https://arxiv.org/abs/2307.02486) by Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, Furu Wei. LongNet is a Transformer variant designed to scale sequence length up to more than 1 billion tokens without sacrificing performance on shorter sequences.
-## Introduction +[![GitHub issues](https://img.shields.io/github/issues/kyegomez/LongNet)](https://github.com/kyegomez/LongNet/issues) +[![GitHub forks](https://img.shields.io/github/forks/kyegomez/LongNet)](https://github.com/kyegomez/LongNet/network) +[![GitHub stars](https://img.shields.io/github/stars/kyegomez/LongNet)](https://github.com/kyegomez/LongNet/stargazers) [![GitHub license](https://img.shields.io/github/license/kyegomez/LongNet)](https://github.com/kyegomez/LongNet/blob/master/LICENSE) +[![Share on Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Share%20%40kyegomez/LongNet)](https://twitter.com/intent/tweet?text=Excited%20to%20introduce%20LongNet,%20the%20all-new%20LongSequence%20model%20with%20the%20potential%20to%20revolutionize%20automation.%20Join%20us%20on%20this%20journey%20towards%20a%20smarter%20future.%20%23LongNet%20%23LongSequence&url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet) +[![Share on Facebook](https://img.shields.io/badge/Share-%20facebook-blue)](https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet) +[![Share on LinkedIn](https://img.shields.io/badge/Share-%20linkedin-blue)](https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&title=Introducing%20LongNet%2C%20the%20All-New%20LongSequence%20Model&summary=LongNet%20is%20the%20next-generation%20LongSequence%20model%20that%20promises%20to%20transform%20industries%20with%20its%20intelligence%20and%20efficiency.%20Join%20us%20to%20be%20a%20part%20of%20this%20revolutionary%20journey%20%23LongNet%20%23LongSequence&source=) +![Discord](https://img.shields.io/discord/999382051935506503) +[![Share on Reddit](https://img.shields.io/badge/-Share%20on%20Reddit-orange)](https://www.reddit.com/submit?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&title=Exciting%20Times%20Ahead%20with%20LongNet%2C%20the%20All-New%20LongSequence%20Model%20%23LongNet%20%23LongSequence) [![Share on Hacker News](https://img.shields.io/badge/-Share%20on%20Hacker%20News-orange)](https://news.ycombinator.com/submitlink?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&t=Exciting%20Times%20Ahead%20with%20LongNet%2C%20the%20All-New%20LongSequence%20Model%20%23LongNet%20%23LongSequence) +[![Share on Pinterest](https://img.shields.io/badge/-Share%20on%20Pinterest-red)](https://pinterest.com/pin/create/button/?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&media=https%3A%2F%2Fexample.com%2Fimage.jpg&description=LongNet%2C%20the%20Revolutionary%20LongSequence%20Model%20that%20will%20Change%20the%20Way%20We%20Work%20%23LongNet%20%23LongSequence) +[![Share on WhatsApp](https://img.shields.io/badge/-Share%20on%20WhatsApp-green)](https://api.whatsapp.com/send?text=I%20just%20discovered%20LongNet,%20the%20all-new%20LongSequence%20model%20that%20promises%20to%20revolutionize%20automation.%20Join%20me%20on%20this%20exciting%20journey%20towards%20a%20smarter%20future.%20%23LongNet%20%23LongSequence%0A%0Ahttps%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet) -Scaling sequence length has become a critical bottleneck in the era of large language models. However, existing methods struggle with either computational complexity or model expressivity, rendering the maximum sequence length restricted. In this paper, they introduce LongNet, a Transformer variant that can scale sequence length to more than 1 billion tokens, without sacrificing the performance on shorter sequences. 
Specifically, they propose dilated attention, which expands the attentive field exponentially as the distance grows.
-## Features
-LongNet has significant advantages:
-1. It has a linear computation complexity and a logarithmic dependency between tokens.
-2. It can serve as a distributed trainer for extremely long sequences.
-3. Its dilated attention is a drop-in replacement for standard attention and can be seamlessly integrated with existing Transformer-based optimizations.
-Experiment results demonstrate that LongNet yields strong performance on both long-sequence modeling and general language tasks. Their work opens up new possibilities for modeling very long sequences, e.g., treating a whole corpus or even the entire Internet as a sequence.
+This is an open source implementation for the paper [LongNet: Scaling Transformers to 1,000,000,000 Tokens](https://arxiv.org/abs/2307.02486) by Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, Furu Wei. LongNet is a Transformer variant designed to scale sequence length up to more than 1 billion tokens without sacrificing performance on shorter sequences.
## Installation
-You can install LongNet using one of the following methods:
-
-### Method 1: Git Clone
-
-1. Clone the LongNet repository from GitHub:
-
-```shell
-git clone https://github.com/kyegomez/LongNet.git
-```
-
-2. Navigate to the cloned directory:
```shell
-cd LongNet
+pip install longnet
```
-3. Prepare the `flash_attn` library:
-
-```bash
-cd flash_attn
-python setup.py install
-cd ..
-```
-
-4. Install the required dependencies:
-
-```shell
-pip install -r requirements.txt
-```
-
-### Method 2: Pip Install
-* Note that pip install may not work, because the `flash-attn` library ships custom CUDA kernels that must be built manually.
-
-1. Install LongNet directly from PyPI using pip:
-
-```shell
-pip install LongNet
-```
-
-Please note that LongNet requires a compatible Python version (tested with Python 3.7).
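
To verify either install, here is a minimal smoke test (a sketch only: it assumes a CUDA device and fp16, since the FlashMHA kernels used internally run in half precision on GPU, and it mirrors the usage example below):

```python
import torch
from LongNet import DilatedAttention

# Minimal post-install smoke test (assumes a CUDA device; the FlashMHA
# kernels used internally expect fp16 on GPU).
attn = DilatedAttention(d_model=512, num_heads=8, dilation_rate=2, segment_size=64)
attn = attn.to("cuda:0", dtype=torch.float16)

x = torch.randn(2, 128, 512, device="cuda:0", dtype=torch.float16)
with torch.no_grad():
    out = attn(x)
print(out.shape)  # expected: torch.Size([2, 128, 512])
```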
-
## Usage
Once you have installed LongNet, you can use the `DilatedAttention` class as follows:
```python
import torch
-import torch.nn as nn
-from LongNet import DilatedAttention
+from long_net import DilatedAttention
-# Replace this with your correct GPU device
-device = "cuda:0"
-dtype = torch.float16
-# Create an instance of DilatedAttention
-d_model = 512
-num_heads = 8
+# model config
+dim = 512
+heads = 8
dilation_rate = 2
segment_size = 64
-dropout = 0.2  # Specify the dropout rate
-attention = DilatedAttention(
-    d_model=d_model,
-    num_heads=num_heads,
-    dilation_rate=dilation_rate,
-    segment_size=segment_size,
-    dropout=dropout,
-).to(device, dtype=dtype)
-
-# Create some dummy input data
-batch_size = 16
-seq_len = 128
-input_dim = d_model
-inputs = torch.randn(batch_size, seq_len, input_dim, device=device, dtype=dtype)
-
-# Forward pass
-outputs = attention(inputs)
-
-# Print the output shape
-print(outputs.shape)  # Expected: [batch_size, seq_len, d_model]
-```
-
-# Training the Model
-There are two methods: one uses `accelerate`, the other `from LongNet import Train`.
-
-### Method 1
-
-* Git clone installation
-
-* Initialize your parameters with `accelerate config`
-
-* Then run `accelerate launch LongNet/training.py`
-
-### Method 2
-
-* Pip install method
-
-```python
-from LongNet import Train
-
-Train()
-```
-
-In the example above, we create an instance of the `DilatedAttention` class with the specified hyperparameters. We then generate some dummy input data and pass it through the attention mechanism to obtain the outputs. Finally, we print the shape of the output tensor.
-
-
-# DilatedAttention Documentation
-
-## Algorithmic Pseudocode:
-```
-1. Initialize the input (Q, K, V) and split it into segments {(Q_i, K_i, V_i)} with equal segment length w.
-2. Sparsify each segment along the sequence dimension by selecting the rows with an interval r.
-3. Feed the sparsified segments into the attention in parallel.
-4. Scatter and concatenate the output O from the attention.
-5. Implement a mixture of dilated attentions with different segment sizes and dilation rates {(r_i, w_i)}.
-6. For multi-head dilated attention, differ the computation among different heads by sparsifying different parts of the query-key-value pairs.
-7. Concatenate the outputs of different heads into a final output.
-```
-
-
-## Class Definition
-
-```python
-class DilatedAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, causal=False, use_xpos=False, use_rel_pos_bias=False):
-        ...
-```
-
-## Parameters
-
-- `d_model` (int): The dimensionality of the model. This should match the dimension of the input to the layer.
-
-- `num_heads` (int): The number of attention heads to use in the `FlashMHA` attention mechanism.
-
-- `dilation_rate` (int): The dilation rate to use when processing the input sequence. Larger values will result in fewer, but wider, attention computations.
-- `segment_size` (int): The size of the segments into which the input sequence is divided before dilating and computing attention.
+# input data
+batch_size = 32
-- `dropout` (float, optional): The dropout rate to apply to the attention outputs. Default is 0.0.
-- `causal` (bool, optional): If True, a causal mask will be applied to the attention outputs, preventing any given position from attending to future positions. Default is False.
+# create model and data +model = DilatedAttention(dim, heads, dilation_rate, segment_size, qk_norm=True) +x = torch.randn((batch_size, seq_len, dim)) -- `use_xpos` (optional): If set to True, xpos is used for positional encoding. Default: False +output = model(x) +print(output) -- `use_rel_pos_bias` (optional): If set to True, relative position bias is used in the attention mechanism. Default: False -## Usage - -### Creating an Instance - -First, you need to create an instance of the `DilatedAttention` class. Here is how you do it: - -```python -dilated_attn = DilatedAttention(d_model=512, num_heads=8, dilation_rate=2, segment_size=64, dropout=0.1, causal=True, use_xpos=False, use_rel_pos_bias=False) ``` -In this example, we're creating a `DilatedAttention` layer with a model dimensionality of 512, 8 attention heads, a dilation rate of 2, a segment size of 64, a dropout rate of 0.1, and causal masking enabled. - -### Forward Pass - -To perform a forward pass through the layer, simply call the instance as if it were a function, passing in your input tensor: +### `LongNetTransformer` +A fully ready to train transformer model with dilated transformer blocks with Feedforwards with layernorm, SWIGLU, and a parallel transformer block ```python import torch +from long_net.model import LongNetTransformer -# Assume x is your input tensor with shape (batch_size, sequence_length, d_model) -x = torch.rand(16, 1000, 512).to(device) - -output = dilated_attn(x) -``` - -In this example, the input tensor `x` has a batch size of 16, a sequence length of 1000, and a model dimensionality of 512. The output tensor will have the same shape as the input tensor. - -### Integration with Other Layers - -You can integrate the `DilatedAttention` layer into a larger model just like any other PyTorch layer. For example, here's how you might use it as part of a simple transformer-like model: +longnet = LongNetTransformer( + num_tokens=20000, + dim=512, + depth=6, + dim_head=64, + heads=8, + ff_mult=4, +) -```python -class SimpleTransformer(nn.Module): - def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout): - super().__init__() - - self.dilated_attn = DilatedAttention(d_model, num_heads, dilation_rate, segment_size, dropout, causal=True, use_xpos=False, use_rel_pos_bias=False) - self.fc = nn.Linear(d_model, 10) # Assume we're doing a 10-class classification task +tokens = torch.randint(0, 20000, (1, 512)) +logits = longnet(tokens) +print(logits) - def forward(self, x): - x = self.dilated_attn(x) - x = self.fc(x[:, 0]) # Use the first position output as the "CLS" token - return x -model = SimpleTransformer(d_model=512, num_heads=8, dilation_rate=2, segment_size=64, dropout=0.1) ``` -In this example, we first pass the input tensor through the `DilatedAttention` layer, then we pass the output of the first position through a fully-connected layer to perform a classification task. - - -## DilationAttention Overview - -`DilatedAttention` is a neural network architecture that incorporates attention mechanisms, specifically the multi-head attention, in a dilated manner. The main idea behind this architecture is to leverage the efficient attention calculation capabilities of the `FlashMHA` method, which is part of the `flash_attn` module, while also providing the ability to handle longer sequences with reduced computation through dilation. 
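
To make the reshape-and-dilate step concrete, here is a tiny, self-contained illustration of the indexing pattern described above (toy sizes and standalone tensors; not the library's internal code):

```python
import torch

# Toy illustration of "segment then dilate": split the sequence into segments
# of `segment_size`, then keep every `dilation_rate`-th row inside each segment.
batch, seq_len, d_model = 2, 16, 4
segment_size, dilation_rate = 8, 2

x = torch.arange(batch * seq_len * d_model, dtype=torch.float32)
x = x.view(batch, seq_len, d_model)

segments = x.view(batch, seq_len // segment_size, segment_size, d_model)
sparse = segments[:, :, ::dilation_rate, :]

# attention now sees segment_size / dilation_rate rows per segment, so the
# per-segment score matrix shrinks from w*w to (w/r)*(w/r)
print(segments.shape)  # torch.Size([2, 2, 8, 4])
print(sparse.shape)    # torch.Size([2, 2, 4, 4])
```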
- -## Components - -The class `DilatedAttention` has the following primary components: - -- **FlashMHA attention**: A fast and efficient multi-head attention mechanism implemented using the `FlashMHA` method. This is the main attention computation method used in the architecture. - -- **Dilation**: Dilating the input sequences allows the model to handle longer sequences with fewer computations, making the architecture more scalable and efficient. - -- **Causal masking (optional)**: If the `causal` argument is set to `True`, a causal mask is applied to the attention outputs, ensuring that each output position only depends on earlier positions in the sequence. This feature is particularly useful when dealing with sequential data where future dependencies should not be considered. - -- **Dropout**: A dropout layer that can be configured to add regularization to the model and prevent overfitting. - -## How It Works - -The `DilatedAttention` model works in the following steps: - -1. **Input Reshape**: Reshapes the input into smaller segments based on the provided `segment_size` and then dilates it by selecting every `dilation_rate` segment. - -2. **Attention Computation**: Uses `FlashMHA` to compute the attention over the dilated segments. - -3. **Causal Masking**: If `causal` is set to `True`, a causal mask is applied to the attention output. This ensures that the output at each position in the sequence does not depend on any future positions. - -4. **Dropout**: Applies dropout to the attention outputs as a means of regularization. - -5. **Output Reshape**: Reshapes the output to match the original sequence length, concatenating the dilated segments. - -## Why It Works - -The `DilatedAttention` model achieves efficiency and scalability in several ways: +# Train +- To run a simple training run on the enwiki8 dataset, gitclone, install the requirements.txt, and then run `python3 train.py` -- **Efficient attention calculation**: The use of `FlashMHA` enables efficient and fast attention computation. +## LongNet Summarized -- **Dilation**: Dilation allows the model to handle longer sequences with reduced computation, effectively making the model more scalable. - -- **Causal masking**: By ensuring that each output position only depends on earlier positions in the sequence, the model becomes suitable for tasks involving sequential data. - -## Potential Optimizations - -1. **Parallelization**: Take advantage of the parallel processing capabilities of modern GPUs for the dilation and reshaping steps. - -2. **Memory optimization**: Efficient memory usage could be achieved through gradient checkpointing or activation pruning. - -3. **Pre-computation**: If some portions of the input data remain constant across multiple operations, pre-compute those portions and store the results for reuse. - -4. **Batch normalization**: Incorporating batch normalization layers could help to speed up the learning process and improve generalization. - -5. **Pruning and Quantization**: Pruning unnecessary connections and quantizing the model parameters can help in reducing the model's memory footprint and speed up computation without sacrificing much accuracy. - - - -## Share with Friends -Share LongNet with your friends and colleagues who might find it useful. 
Simply click on the links below to share on various platforms:
-
-- [Facebook](https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet)
-- [Twitter](https://twitter.com/intent/tweet?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&text=Check%20out%20the%20LongNet%20repository%2C%20an%20implementation%20for%20scaling%20Transformers%20to%201%2C000%2C000%2C000%20tokens.%20%23LongNet%20%23Transformers)
-- [LinkedIn](https://www.linkedin.com/shareArticle?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&title=LongNet%3A%20Scaling%20Transformers%20to%201%2C000%2C000%2C000%20Tokens)
-- [Reddit](https://reddit.com/submit?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&title=LongNet%3A%20Scaling%20Transformers%20to%201%2C000%2C000%2C000%20Tokens)
-- [WhatsApp](https://wa.me/?text=Check%20out%20the%20LongNet%20repository%2C%20an%20implementation%20for%20scaling%20Transformers%20to%201%2C000%2C000%2C000%20tokens%3A%20https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet)
-- [Email](mailto:?subject=Check%20out%20the%20LongNet%20repository&body=Hey%2C%0A%0ACheck%20out%20the%20LongNet%20repository%2C%20an%20implementation%20for%20scaling%20Transformers%20to%201%2C000%2C000%2C000%20tokens%3A%0A%0Ahttps%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet%0A%0AEnjoy%21)
-- [Hacker News](https://news.ycombinator.com/submitlink?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FLongNet&t=LongNet%3A%20Scaling%20Transformers%20to%201%2C000%2C000%2C000%20Tokens)
-
-Thank you for sharing!
-
-[Share LongNet Repository](https://github.com/kyegomez/LongNet)
-
-
-
-
-# Roadmap
-
-* Test, evaluate, and patch.
-
-* Create an integration of `DilatedAttention` with `FlashBlocksparseMHA`
-
-* Create a multi-modal `DilatedAttention` with multiway, sub-layernorm, xpos, QK layernorm, and maybe a single write query head
-
-* Integrate ALiBi and xpos for even further ridiculous length extrapolation
+Scaling sequence length has become a critical bottleneck in the era of large language models. However, existing methods struggle with either computational complexity or model expressivity, rendering the maximum sequence length restricted. In this paper, they introduce LongNet, a Transformer variant that can scale sequence length to more than 1 billion tokens, without sacrificing the performance on shorter sequences. Specifically, they propose dilated attention, which expands the attentive field exponentially as the distance grows.
-* Recreate in Triton or JAX for an ultra speed boost
+## Features
+LongNet has significant advantages:
+1. It has a linear computation complexity and a logarithmic dependency between tokens.
+2. It can serve as a distributed trainer for extremely long sequences.
+3. Its dilated attention is a drop-in replacement for standard attention and can be seamlessly integrated with existing Transformer-based optimizations.
-* Integrate [Dynamic sparse flash attention](https://github.com/epfml/dynamic-sparse-flash-attention/blob/main/runtime-experiments/timeperf-hash-and-qk-sparse.ipynb) with DilatedAttention
+Experiment results demonstrate that LongNet yields strong performance on both long-sequence modeling and general language tasks. Their work opens up new possibilities for modeling very long sequences, e.g., treating a whole corpus or even the entire Internet as a sequence.
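
As a rough, illustrative sanity check on the linear-complexity claim in the feature list above (back-of-the-envelope arithmetic only; the geometric `(w, r)` pairs are an assumption in the spirit of the paper, not its exact configuration):

```python
# Compare attention score-matrix work for vanilla vs. dilated attention.
# Each segment of size w, sparsified with rate r, costs (w/r)^2, and there are
# N/w segments, so one (w, r) pair costs N*w/r^2; with w_i = r_i = 2^i the sum
# over all pairs stays proportional to N.
for N in (2**13, 2**17, 2**21):
    vanilla = N * N
    pairs = [(2**i, 2**i) for i in range(1, 8)]  # assumed geometric (w, r) pairs
    dilated = sum(N * w // (r * r) for w, r in pairs)
    print(f"N={N:>9}  vanilla={vanilla:.3e}  dilated={dilated:.3e}")
```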
## Citation -``` +```bibtex @inproceedings{ding2023longnet, title={LongNet: Scaling Transformers to 1,000,000,000 Tokens}, author={Ding, Jiayu and Ma, Shuming and Dong, Li and Zhang, Xingxing and Huang, Shaohan and Wang, Wenhui and Wei, Furu}, @@ -324,3 +106,10 @@ Thank you for sharing! } ``` +----- + +# Todo + +- [ ] Fix the ParallelTransformer Block's forward pass with dilated attn +- [ ] Train on enwiki 8 and test +- [ ] Create multihead iteration diff --git a/agora-banner-water.png b/agora-banner-water.png deleted file mode 100644 index 86fb95d..0000000 Binary files a/agora-banner-water.png and /dev/null differ diff --git a/benchmark/test.py b/benchmark/test.py deleted file mode 100644 index 6b9af5f..0000000 --- a/benchmark/test.py +++ /dev/null @@ -1,38 +0,0 @@ -import timeit -import torch -from LongNet import DilatedAttention - - -#model config -d_model = 512 -num_heads = 8 -dilation_rate = 2 -segment_size = 64 - -device = "cuda:0" -dtype=torch.float16 - -#input data -batch_size = 32 -seq_len = 1024 - - -#create model and data -model = DilatedAttention(d_model, num_heads, dilation_rate, segment_size).to(device) -x = torch.randn((batch_size, seq_len, d_model), device=device, dtype=dtype) - - -#test forward pass -with torch.no_grad(): - output = model(x) - print(f"Output shape: {output.shape}") # expected (batch_size, seq_Len) - - -#benchmark model -num_runs = 1000 -start_time = timeit.default_timer() -for _ in range(num_runs): - model(x) - -elapsed_time = timeit.default_timer() - start_time -print(f"Average forward pass time: {elapsed_time / num_runs:.6f} seconds") \ No newline at end of file diff --git a/data/enwik8.gz b/data/enwik8.gz new file mode 100644 index 0000000..7a8ec66 Binary files /dev/null and b/data/enwik8.gz differ diff --git a/example.py b/example.py index 3a7ca34..c10c264 100644 --- a/example.py +++ b/example.py @@ -1,38 +1,21 @@ -import timeit import torch -from LongNet.attention import DilatedAttention +from long_net import DilatedAttention -#model config -d_model = 512 -num_heads = 8 +# model config +dim = 512 +heads = 8 dilation_rate = 2 segment_size = 64 -device = "cuda:0" -dtype=torch.float16 - -#input data +# input data batch_size = 32 -seq_len = 1024 - - -#create model and data -model = DilatedAttention(d_model, num_heads, dilation_rate, segment_size).to(device) -x = torch.randn((batch_size, seq_len, d_model), device=device, dtype=dtype) - - -#test forward pass -with torch.no_grad(): - output = model(x) - print(f"Output shape: {output.shape}") # expected (batch_size, seq_Len) +seq_len = 8192 -#benchmark model -num_runs = 1000 -start_time = timeit.default_timer() -for _ in range(num_runs): - model(x) +# create model and data +model = DilatedAttention(dim, heads, dilation_rate, segment_size, qk_norm=True) +x = torch.randn((batch_size, seq_len, dim)) -elapsed_time = timeit.default_timer() - start_time -print(f"Average forward pass time: {elapsed_time / num_runs:.6f} seconds") +output = model(x) +print(output) diff --git a/images/agorabanner.png b/images/agorabanner.png new file mode 100644 index 0000000..030ad15 Binary files /dev/null and b/images/agorabanner.png differ diff --git a/images/longnet.jpg b/images/longnet.jpg new file mode 100644 index 0000000..55aa0cf Binary files /dev/null and b/images/longnet.jpg differ diff --git a/long_net/__init__.py b/long_net/__init__.py new file mode 100644 index 0000000..409776a --- /dev/null +++ b/long_net/__init__.py @@ -0,0 +1,4 @@ +from long_net.attention import DilatedAttention +from long_net.model import 
LongNetTransformer + +__all__ = ["DilatedAttention", "LongNetTransformer"] diff --git a/long_net/attention.py b/long_net/attention.py new file mode 100644 index 0000000..f901702 --- /dev/null +++ b/long_net/attention.py @@ -0,0 +1,137 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +# from long_net.attend import FlashAttention +from zeta.nn.attention.flash_attention import FlashAttention +from long_net.utils import XPOS, RelativePositionBias + + +# add alibi, qk layer norm, one write head, multiway, +class DilatedAttention(nn.Module): + """ + Dilated Attention Module. + + Arguments: + dim: The dimension of the attention layers. + heads: The number of attention heads. + dilation_rate: The dilation rate for dilated attention. + segment_size: The segment size for dilated attention. + dropout (optional): The dropout probability. Default: 0.0 + causal (optional): If set to True, the attention mechanism is causal. Default: False + use_xpos (optional): If set to True, xpos is used for positional encoding. Default: False + use_rel_pos_bias (optional): If set to True, relative position bias is used in the attention mechanism. Default: False + qk_norm (optional): If set to True, the query/key/value projections are layer-normalized before attention. Default: False + dtype (optional): The dtype used for attention computation. Default: torch.float16 + device (optional): The device the attention module runs on. Default: "cuda:0" + + Usage: + The `DilatedAttention` class can be used as a module for neural networks and is especially suited for transformer architectures. + + Example: + attention = DilatedAttention(dim=512, heads=8, dilation_rate=2, segment_size=64, use_xpos=True, use_rel_pos_bias=True) + output = attention(input_tensor) + + This will return the output tensor after applying dilated attention. The `use_xpos` and `use_rel_pos_bias` parameters allow for switching on positional encoding and relative positional bias respectively. + """ + + def __init__( + self, + dim: int, + heads: int, + dilation_rate: int, + segment_size: int, + dropout: float = 0.0, + causal: bool = False, + use_xpos: bool = False, + use_rel_pos_bias: bool = False, + qk_norm: bool = False, + dtype: torch.dtype = torch.float16, + device: str = "cuda:0", + ) -> None: + super(DilatedAttention, self).__init__() + self.dim = dim + self.heads = heads + self.dilation_rate = dilation_rate + self.segment_size = segment_size + self.dropout = nn.Dropout(dropout) + self.causal = causal + self.use_xpos = use_xpos + self.use_rel_pos_bias = use_rel_pos_bias + self.qk_norm = qk_norm + self.dtype = dtype + self.device = device + + self.attention = FlashAttention(causal=self.causal, dropout=dropout).to( + device + ) + + if use_xpos: + self.xpos = XPOS(head_dim=dim // heads) + if use_rel_pos_bias: + self.relative_bias = RelativePositionBias( + num_buckets=32, max_distance=128, n_heads=heads + ) + + self.norm = nn.LayerNorm(dim) + + # head offsets + self.head_offsets = nn.Parameter(torch.randn(heads, dim)) + + # Linear Projections + self.proj_q = nn.Linear(dim, dim) + self.proj_k = nn.Linear(dim, dim) + self.proj_v = nn.Linear(dim, dim) + + def get_mask(self, i, j): + """i = row, j = column""" + return torch.ones((i, j), device=self.device, dtype=torch.bool).triu( + j - i + 2 + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass of the DilatedAttention module. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor. 
+ """ + batch_size, seq_len, _ = x.shape + padding_len = -seq_len % self.segment_size + x = F.pad(x, (0, 0, 0, padding_len)) + seq_len = seq_len + padding_len + + if self.use_xpos: + x = self.xpos(x) + + # Split and sparsify + x = x.view(batch_size, -1, self.segment_size, self.dim) + x = x[:, :, :: self.dilation_rate, :] + + # qk_norm + if self.qk_norm: + q, k, v = map( + self.norm, (self.proj_q(x), self.proj_k(x), self.proj_v(x)) + ) + else: + q, k, v = self.proj_q(x), self.proj_k(x), self.proj_v(x) + + # Perform attention + attn_output = self.attention(q, k, v) + + # if use rel pos => apply relative positioning bias + if self.use_rel_pos_bias: + attn_output += self.relative_bias( + batch_size, attn_output.size(1), attn_output.size(1) + ) + + # if causal create a mask and apply to the output + if self.causal: + mask = self.get_mask(attn_output.size(1), attn_output.size(1)) + + attn_output = attn_output.masked_fill(mask, float("-inf")) + + # apply dropout + attn_output = self.dropout(attn_output) + # Scatter and concatenate + attn_output = attn_output.reshape(batch_size, -1, self.dim) + return attn_output diff --git a/long_net/model.py b/long_net/model.py new file mode 100644 index 0000000..faad2e6 --- /dev/null +++ b/long_net/model.py @@ -0,0 +1,357 @@ +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import einsum, nn + +# from long_net.attention import DilatedAttention +from long_net.attention import DilatedAttention + + +# helpers +def exists(val): + return val is not None + + +def eval_decorator(fn): + def inner(model, *args, **kwargs): + was_training = model.training + model.eval() + out = fn(model, *args, **kwargs) + model.train(was_training) + return out + + return inner + + +# top k filtering + + +def top_k(logits, thres=0.9): + k = int((1 - thres) * logits.shape[-1]) + val, ind = torch.topk(logits, k) + probs = torch.full_like(logits, float("-inf")) + probs.scatter_(1, ind, val) + return probs + + +# normalization +# they use layernorm without bias, something that pytorch does not offer + + +class LayerNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.gamma = nn.Parameter(torch.ones(dim)) + self.register_buffer("beta", torch.zeros(dim)) + + def forward(self, x): + return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta) + + +# residual +# normalization +class RMSNorm(nn.Module): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class Residual(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x): + return self.fn(x) + x + + +# rotary positional embedding +# https://arxiv.org/abs/2104.09864 + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, max_seq_len, *, device): + seq = torch.arange( + max_seq_len, device=device, dtype=self.inv_freq.dtype + ) + freqs = einsum("i , j -> i j", seq, self.inv_freq) + return torch.cat((freqs, freqs), dim=-1) + + +def rotate_half(x): + x = rearrange(x, "... (j d) -> ... 
j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(pos, t): + return (t * pos.cos()) + (rotate_half(t) * pos.sin()) + + +# classic Noam Shazeer paper, except here they use SwiGLU instead of the more popular GEGLU for gating the feedforward +# https://arxiv.org/abs/2002.05202 + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +# parallel attention and feedforward with residual +# discovered by Wang et al + EleutherAI from GPT-J fame + +# Assuming necessary imports like RotaryEmbedding, SwiGLU, etc. are present + + +def FeedForward(dim, hidden_dim, dropout=0.0): + return nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, hidden_dim), + nn.GELU(), + nn.Linear(hidden_dim, dim), + nn.Dropout(dropout), + ) + + +class ParallelTransformerBlock(nn.Module): + def __init__( + self, + dim, + dim_head=64, + dilation_rate: int = 2, + segment_size: int = 64, + heads=8, + ff_mult=4, + *args, + **kwargs, + ): + super().__init__() + self.norm = LayerNorm(dim) + + attn_inner_dim = dim_head * heads + ff_inner_dim = dim * ff_mult + self.fused_dims = ( + attn_inner_dim, + dim_head, + dim_head, + (ff_inner_dim * 2), + ) + + self.heads = heads + self.scale = dim_head**-0.5 + self.rotary_emb = RotaryEmbedding(dim_head) + + self.fused_attn_ff_proj = nn.Linear( + dim, sum(self.fused_dims), bias=False + ) + self.attn_out = nn.Linear(attn_inner_dim, dim, bias=False) + + self.attn = DilatedAttention( + dim, + heads, + dilation_rate, + segment_size, + qk_norm=True, + *args, + **kwargs, + ) + + self.ff_out = nn.Sequential( + SwiGLU(), nn.Linear(ff_inner_dim, dim, bias=False) + ) + + # for caching causal mask and rotary embeddings + + self.register_buffer("mask", None, persistent=False) + self.register_buffer("pos_emb", None, persistent=False) + + self.proj_q = nn.Linear(dim, dim) + self.proj_k = nn.Linear(dim, dim) + self.proj_v = nn.Linear(dim, dim) + + def get_mask(self, n, device): + if self.mask is not None and self.mask.shape[-1] >= n: + return self.mask[:n, :n] + + mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1) + self.register_buffer("mask", mask, persistent=False) + return mask + + def get_rotary_embedding(self, n, device): + if self.pos_emb is not None and self.pos_emb.shape[-2] >= n: + return self.pos_emb[:n] + + pos_emb = self.rotary_emb(n, device=device) + self.register_buffer("pos_emb", pos_emb, persistent=False) + return pos_emb + + def forward(self, x): + """ + einstein notation + b - batch + h - heads + n, i, j - sequence length (base sequence length, source, target) + d - feature dimension + """ + + n, device, h = x.shape[1], x.device, self.heads + + # pre layernorm + + x = self.norm(x) + + # attention queries, keys, values, and feedforward inner + + q = self.proj_q(x) + k = self.proj_k(x) + v = self.proj_v(x) + + # attention + + attn = self.attn(x) + + # # aggregate values + + # out = einsum("b h i j, b j d -> b h i d", attn, v) + + # # merge heads + + # out = rearrange(out, "b h n d -> b n (h d)") + return attn + + +# Transformer +class Transformer(nn.Module): + def __init__( + self, + dim, + depth, + heads, + dim_head, + ff_mult=4, + dilation_rate: int = 2, + segment_size: int = 64, + ): + super().__init__() + self.layers = nn.ModuleList([]) + + self.feedforward = (FeedForward(dim, dim, dropout=0.1),) + + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + ParallelTransformerBlock( + dim, + dim_head, + dilation_rate, + segment_size, + heads, + 
ff_mult, + ), + FeedForward(dim, dim, dropout=0.1), + ] + ) + ) + + def forward(self, x): + for block, ff in self.layers: + x = block(x) + x + x = ff(x) + x + return x + + +# classes + + +class LongNetTransformer(nn.Module): + def __init__( + self, + dim, + depth, + num_tokens, + dim_head=64, + heads=8, + ff_mult=4, + dilation_rate: int = 2, + segment_size: int = 64, + ): + super().__init__() + self.emb = nn.Embedding(num_tokens, dim) + + self.transformer = Transformer( + dim, depth, heads, dim_head, ff_mult, dilation_rate, segment_size + ) + + self.to_logits = nn.Sequential(RMSNorm(dim), nn.Linear(dim, num_tokens)) + + def forward(self, x): + x = self.emb(x) + x = self.transformer(x) + return self.to_logits(x) + + +# autoregressive wrapper + + +class AutoregressiveWrapper(nn.Module): + def __init__(self, net, max_seq_len=2048, pad_value=0): + super().__init__() + self.max_seq_len = max_seq_len + self.pad_value = pad_value + self.net = net + + @torch.no_grad() + @eval_decorator + def generate( + self, + start_tokens, + seq_len, + eos_token=None, + temperature=1.0, + filter_thres=0.9, + **kwargs, + ): + b, t, device = *start_tokens.shape, start_tokens.device + + out = start_tokens + + for _ in range(seq_len): + logits = self.net(out, **kwargs)[:, -1, :] + + filtered_logits = top_k(logits, thres=filter_thres) + probs = F.softmax(filtered_logits / temperature, dim=-1) + + sample = torch.multinomial(probs, 1) + + out = torch.cat((out, sample), dim=-1) + + if exists(eos_token): + is_eos_token = out == eos_token + + if is_eos_token.any(dim=-1).all(): + # mask out everything after the eos tokens + shifted_is_eos_tokens = F.pad(is_eos_token, (1, -1)) + mask = shifted_is_eos_tokens.float().cumsum(dim=-1) >= 1 + out = out.masked_fill(mask, self.pad_value) + break + + out = out[:, t:] + return out + + def forward(self, x, **kwargs): + x_inp, x_labels = x[:, :-1], x[:, 1:] + logits = self.net(x_inp, **kwargs) + return F.cross_entropy(rearrange(logits, "b c n -> b n c"), x_labels) diff --git a/LongNet/utils.py b/long_net/utils.py similarity index 55% rename from LongNet/utils.py rename to long_net/utils.py index 1394b8f..b8cb43f 100644 --- a/LongNet/utils.py +++ b/long_net/utils.py @@ -1,7 +1,8 @@ -import numpy as np -import torch +import math +from typing import List, Optional, Tuple, Union -# This is the unfused version of StableAdamW. It is slower than the fused version (coming). 
+import torch +import torch.nn as nn class StableAdamWUnfused(torch.optim.Optimizer): @@ -17,7 +18,9 @@ def __init__( custom_scalar=65536, ): beta1, beta2 = betas[0], betas[1] - defaults = dict(lr=lr, weight_decay=weight_decay, beta1=beta1, beta2=beta2) + defaults = dict( + lr=lr, weight_decay=weight_decay, beta1=beta1, beta2=beta2 + ) super(StableAdamWUnfused, self).__init__(params, defaults) self.eps = eps @@ -37,9 +40,8 @@ def __setstate__(self, state): super(StableAdamWUnfused, self).__setstate__(state) def step(self, closure=None): - loss = None if closure is not None: - loss = closure() + closure() for group in self.param_groups: lr = group["lr"] @@ -68,8 +70,12 @@ def step(self, closure=None): v = param_state["exp_avg"] u = param_state["exp_avg_sq"] - beta1hat = beta1 * (1 - beta1 ** (step - 1)) / (1 - beta1**step) - beta2hat = beta2 * (1 - beta2 ** (step - 1)) / (1 - beta2**step) + beta1hat = ( + beta1 * (1 - beta1 ** (step - 1)) / (1 - beta1**step) + ) + beta2hat = ( + beta2 * (1 - beta2 ** (step - 1)) / (1 - beta2**step) + ) v = v.mul_(beta1hat).add_(g, alpha=1.0 - beta1hat) u = u.mul_(beta2hat).addcmul_(g, g, value=1.0 - beta2hat) @@ -79,7 +85,8 @@ def step(self, closure=None): # StableAdamW = AdamW + update clipping (https://arxiv.org/abs/1804.04235) applied tensor-wise. rms = ( torch.div( - g.pow(2), torch.maximum(u, (self.eps**2) * torch.ones_like(u)) + g.pow(2), + torch.maximum(u, (self.eps**2) * torch.ones_like(u)), ) .mean() .sqrt() @@ -97,11 +104,6 @@ def step(self, closure=None): group["step"] = step + 1 - -import math -import torch.nn as nn - - class RelativePositionBias(nn.Module): def __init__( self, bidirectional=True, num_buckets=32, max_distance=128, n_heads=12 @@ -111,7 +113,9 @@ def __init__( self.num_buckets = num_buckets self.max_distance = max_distance self.n_heads = n_heads - self.relative_attention_bias = nn.Embedding(self.num_buckets, self.n_heads) + self.relative_attention_bias = nn.Embedding( + self.num_buckets, self.n_heads + ) @staticmethod def _relative_position_bucket( @@ -150,9 +154,13 @@ def compute_bias(self, qlen, klen, step=None): device=self.relative_attention_bias.weight.device, )[:, None] memory_position = torch.arange( - klen, dtype=torch.long, device=self.relative_attention_bias.weight.device + klen, + dtype=torch.long, + device=self.relative_attention_bias.weight.device, )[None, :] - relative_position = memory_position - context_position # shape (qlen, klen) + relative_position = ( + memory_position - context_position + ) # shape (qlen, klen) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) @@ -163,14 +171,14 @@ def compute_bias(self, qlen, klen, step=None): rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias( rp_bucket - ) # shape (qlen, klen, num_heads) + ) # shape (qlen, klen, heads) values = values.permute([2, 0, 1]).unsqueeze( 0 - ) # shape (1, num_heads, qlen, klen) + ) # shape (1, heads, qlen, klen) return values def forward(self, batch_size, qlen, klen, step=None): - # shape (batch * num_heads, qlen, klen) + # shape (batch * heads, qlen, klen) return ( self.compute_bias(qlen, klen, step) .repeat(batch_size, 1, 1, 1) @@ -178,24 +186,23 @@ def forward(self, batch_size, qlen, klen, step=None): ) - -import numpy as np -import torch -import torch.nn as nn - def fixed_pos_embedding(x): seq_len, dim = x.shape inv_freq = 1.0 / (10000 ** (torch.arange(0, dim) / dim)) - sinusoid_inp = ( - torch.einsum("i , j -> i j", torch.arange(0, seq_len, 
dtype=torch.float), inv_freq).to(x) - ) + sinusoid_inp = torch.einsum( + "i , j -> i j", torch.arange(0, seq_len, dtype=torch.float), inv_freq + ).to(x) return torch.sin(sinusoid_inp), torch.cos(sinusoid_inp) + def rotate_every_two(x): x1 = x[:, :, ::2] x2 = x[:, :, 1::2] x = torch.stack((-x2, x1), dim=-1) - return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\ + return x.flatten( + -2 + ) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\ + def duplicate_interleave(m): """ @@ -207,6 +214,7 @@ def duplicate_interleave(m): m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy return m + def apply_rotary_pos_emb(x, sin, cos, scale=1): sin, cos = map(lambda t: duplicate_interleave(t * scale), (sin, cos)) # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) @@ -214,30 +222,149 @@ def apply_rotary_pos_emb(x, sin, cos, scale=1): class XPOS(nn.Module): - def __init__( - self, head_dim, scale_base=512 - ): + def __init__(self, head_dim, scale_base=512): super().__init__() self.head_dim = head_dim self.scale_base = scale_base self.register_buffer( - "scale", (torch.arange(0, head_dim, 2) + 0.4 * head_dim) / (1.4 * head_dim) + "scale", + (torch.arange(0, head_dim, 2) + 0.4 * head_dim) / (1.4 * head_dim), ) def forward(self, x, offset=0, downscale=False): length = x.shape[1] min_pos = -(length + offset) // 2 max_pos = length + offset + min_pos - scale = self.scale ** torch.arange(min_pos, max_pos, 1).to(self.scale).div(self.scale_base)[:, None] + scale = ( + self.scale + ** torch.arange(min_pos, max_pos, 1) + .to(self.scale) + .div(self.scale_base)[:, None] + ) sin, cos = fixed_pos_embedding(scale) if scale.shape[0] > length: scale = scale[-length:] sin = sin[-length:] cos = cos[-length:] - + if downscale: scale = 1 / scale x = apply_rotary_pos_emb(x, sin, cos, scale) return x + + +def SparsifyIndices( + x: torch.Tensor, ws: List[int], rs: List[int], head_idx: int +) -> Tuple[int, torch.Tensor, Optional[torch.Tensor]]: + b, n, c = x.size() + + print(f"x.size 1st: {x.shape} and xdtype: {x.dtype}") + + x_indices = torch.arange(0, n, dtype=torch.long, device=x.device)[ + None, :, None + ] + print(f"X indices dtype: {x_indices.shape} and dtype: {x.dtype}") + + num_subatt = sum([int(math.ceil(n / w)) for w in ws]) + max_subatt_n = min(n, max([w // r for w, r in zip(ws, rs)])) + + sparse_indices = -1 * torch.ones( + (b, num_subatt * max_subatt_n, c), device=x.device, dtype=torch.int64 + ) + print( + f"Sparse indices shape and dtype: {sparse_indices.shape} and dtype:" + f" {sparse_indices.dtype}" + ) + + subatt_idx = 0 + for w, r in zip(ws, rs): + for segment_indices in torch.split(x_indices, w, 1): + offset = head_idx % r + cur_sparse_indices = segment_indices[:, offset::r, :] + print( + f"Current sparse indices shape {cur_sparse_indices.shape} and" + f" dtype: {cur_sparse_indices.dtype}" + ) + start_idx = subatt_idx * max_subatt_n + end_idx = start_idx + cur_sparse_indices.shape[1] + sparse_indices[:, start_idx:end_idx] = cur_sparse_indices + subatt_idx += 1 + + if -1 in sparse_indices: + padding_mask = sparse_indices[:, :, 0] != -1 + + # to allow gather work for batching + sparse_indices[~padding_mask] = 0 + + # combine batch and subattention dims + print( + f"Padding mask shape: {padding_mask.shape} and dtype:" + f" {padding_mask.dtype}" + ) + padding_mask = padding_mask.view((-1, max_subatt_n)) + else: + padding_mask = None + + return max_subatt_n, sparse_indices, padding_mask + + +def MixOutputs( + 
out_shape: Tuple[int, int, int], + out_dtype: torch.dtype, + out_device: Union[torch.device, str], + a_os: torch.Tensor, + a_denoms: torch.Tensor, + a_indices: torch.Tensor, +) -> torch.Tensor: + print(f"Input 'a_os' shape: {a_os.shape} and dtype: {a_os.dtype}") + print( + f"Input 'a_denoms' shape: {a_denoms.shape} and dtype: {a_denoms.dtype}" + ) + print( + f"Input 'a_indices' shape: {a_indices.shape} and dtype:" + f" {a_indices.dtype}" + ) + + # Ensure the source tensor has the same dtype as the target tensor before the scatter operation + a_denoms = a_denoms.to(out_dtype) + print(f"Converted 'a_denoms' dtype: {a_denoms.dtype}") + + # explicitly define the shape of att_denom_sums + att_denom_sums_shape = (out_shape[0], out_shape[1]) + print(f"Att_denom_sums shape to be initialized: {att_denom_sums_shape}") + + # calculate sums of softmax denominators + att_denom_sums = torch.zeros( + att_denom_sums_shape, device=out_device, dtype=out_dtype + ) + print( + f"Initialized 'att_denom_sums' shape: {att_denom_sums.shape} and dtype:" + f" {att_denom_sums.dtype}" + ) + + # Use scatter_add_ without unsqueezing a_denoms + att_denom_sums.scatter_add_(1, a_indices[:, :, 0].squeeze(-1), a_denoms) + + # select attention softmax denominator sums for current sparse indices + sparse_att_denom_sum = torch.gather(att_denom_sums, 1, a_indices[:, :, 0]) + print( + f"'sparse_att_denom_sum' shape: {sparse_att_denom_sum.shape} and dtype:" + f" {sparse_att_denom_sum.dtype}" + ) + + # compute alphas + alphas = torch.divide(a_denoms, sparse_att_denom_sum)[:, :, None] + print(f"Alphas shape: {alphas.shape} and dtype: {alphas.dtype}") + + out = torch.zeros(out_shape, dtype=out_dtype, device=out_device) + print(f"Initialized 'out' shape: {out.shape} and dtype: {out.dtype}") + + out.scatter_add_( + 1, + a_indices[:, :, : out.shape[2]], + torch.multiply(a_os, alphas), + ) + + return out diff --git a/longnet_transformer.py b/longnet_transformer.py new file mode 100644 index 0000000..642c6ee --- /dev/null +++ b/longnet_transformer.py @@ -0,0 +1,15 @@ +import torch +from long_net.model import LongNetTransformer + +longnet = LongNetTransformer( + num_tokens=20000, + dim=512, + depth=6, + dim_head=64, + heads=8, + ff_mult=4, +) + +tokens = torch.randint(0, 20000, (1, 512)) +logits = longnet(tokens) +print(logits) diff --git a/notebook.ipynb b/notebook.ipynb deleted file mode 100644 index 34c98e2..0000000 --- a/notebook.ipynb +++ /dev/null @@ -1,5320 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7fk13NtwL5-u" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/kyegomez/LongNet.git\n", - "\n", - "%cd LongNet\n", - "\n", - "!pip install -r requirements.txt\n", - "\n", - "%cd test\n", - "\n", - "!python attention.py\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "NC6o2ezmxOIT", - "outputId": "7d2d0d64-ab67-41c8-a88e-020385b598ff" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.2)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.6.3)\n", - "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) 
(1.11.1)\n", … [remainder of the deleted notebook.ipynb: roughly 5,000 lines of captured Colab cell output — pip "Requirement already satisfied" listings for torch, einops, and torchscale, the flash-attention clone with its cutlass submodule checkout, setup.py install/easy_install deprecation warnings, gcc strict-aliasing and unused-variable warnings, and dozens of near-identical nvcc/ptxas "Compiling entry function" register-usage reports]
"/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "ptxas info : 25 bytes gmem\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 
bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for 
_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 242 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 248 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 223 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 223 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 241 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 246 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for 
_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 194 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 218 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 225 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 176 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 175 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 199 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 210 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 232 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : 25 bytes gmem\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 253 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : 
Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 231 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 231 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 242 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 203 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 215 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 226 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 186 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 187 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 202 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 216 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 221 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 234 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z47fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi16ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 242 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "/usr/local/cuda/bin/nvcc -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/src -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/cutlass/include -I/usr/local/lib/python3.10/dist-packages/torch/include -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.10/dist-packages/torch/include/TH -I/usr/local/lib/python3.10/dist-packages/torch/include/THC -I/usr/local/cuda/include -I/usr/include/python3.10 -c csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.cu -o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options '-fPIC' -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -lineinfo -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 --threads 4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -DTORCH_EXTENSION_NAME=flash_attn_cuda -D_GLIBCXX_USE_CXX11_ABI=0\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " 
instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "ptxas info : 25 bytes gmem\n", - "ptxas info : Compiling entry function '_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 40 bytes stack frame, 56 bytes spill stores, 64 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function 
'_Z38fmha_block_fprop_fp16_sm80_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n",
- "[... repetitive ptxas output elided: the forward-pass kernel fmha_block_fprop_fp16_sm80_loop_kernel is compiled for every FMHA_kernel_traits<256, {64,32,16}, 16, 1, 4, 8, __half> template instantiation (all Boolean flag combinations) targeting sm_90, sm_80, and sm_75. Each variant reports 156-255 registers and at most 48 bytes of register spill; the sm_80 and sm_75 builds additionally report 576 bytes cmem[0] and 16 bytes cmem[2], and warning #186-D ('pointless comparison of unsigned integer with zero') from torch/include/c10/util/irange.h(54) recurs throughout ...]\n",
- "/usr/local/cuda/bin/nvcc -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/src -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/cutlass/include -I/usr/local/lib/python3.10/dist-packages/torch/include -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.10/dist-packages/torch/include/TH -I/usr/local/lib/python3.10/dist-packages/torch/include/THC -I/usr/local/cuda/include -I/usr/include/python3.10 -c csrc/flash_attn/src/fmha_bwd_hdim128.cu -o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_bwd_hdim128.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options '-fPIC' -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -lineinfo -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 --threads 4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -DTORCH_EXTENSION_NAME=flash_attn_cuda -D_GLIBCXX_USE_CXX11_ABI=0\n",
- "[... repetitive ptxas output elided: the backward-pass kernels fmha_bwd_dq_dk_dv_loop_kernel, fmha_bwd_q_dk_dv_loop_seqparallel_kernel, and fmha_bwd_dot_do_o_kernel (FMHA_kernel_traits<128, 128, 16, 1, 8, 256>, in both __half and __nv_bfloat16) are compiled for sm_90 and sm_80. The loop kernels use 248-255 registers with up to 52 bytes of spill on their worst variants; fmha_bwd_dot_do_o_kernel uses 23-26 registers. Warning #177-D (variable \"M\" was declared but never referenced, csrc/flash_attn/src/fmha_bwd_launch_template.h(64)) and the irange.h warning #186-D recur throughout ...]\n",
- "ptxas info    : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n",
- "ptxas info    : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - "    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info    : Used 252 registers, 
640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 252 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : 7246 bytes gmem\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function 
properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 32 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 24 bytes spill stores, 44 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 80 bytes spill stores, 100 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 252 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 72 bytes stack frame, 92 bytes spill stores, 112 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 38 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 
registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 40 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi128ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 88 bytes stack frame, 116 bytes spill stores, 156 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "/usr/local/cuda/bin/nvcc -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/src -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/cutlass/include 
-I/usr/local/lib/python3.10/dist-packages/torch/include -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.10/dist-packages/torch/include/TH -I/usr/local/lib/python3.10/dist-packages/torch/include/THC -I/usr/local/cuda/include -I/usr/include/python3.10 -c csrc/flash_attn/src/fmha_bwd_hdim32.cu -o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_bwd_hdim32.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options '-fPIC' -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -lineinfo -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 --threads 4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -DTORCH_EXTENSION_NAME=flash_attn_cuda -D_GLIBCXX_USE_CXX11_ABI=0\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "ptxas info : 17 bytes gmem\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 217 registers\n", - "ptxas info : Compiling entry function 
'_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 215 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 228 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 233 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 226 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 233 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 233 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 244 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack 
frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 26 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 225 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 226 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 241 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 225 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 236 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - 
"ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 236 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 158 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 156 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 164 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 178 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 158 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - 
"ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 179 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 26 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 162 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 152 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 203 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 205 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 217 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 215 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 228 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 233 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - 
"ptxas info : Used 226 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 233 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 233 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 244 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 225 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 226 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "[... several hundred near-identical ptxas lines omitted: the FlashAttention backward kernels (fmha_bwd_dq_dk_dv_loop_kernel, fmha_bwd_q_dk_dv_loop_seqparallel_kernel, fmha_bwd_dot_do_o_kernel) are instantiated for __half and __nv_bfloat16 in the 256- and 128- FMHA_kernel_traits configurations and compiled for 'sm_90', 'sm_80', and 'sm_75'; every variant reports 0 bytes stack frame, 0 bytes spill stores/loads, and roughly 23-251 registers (with 640 bytes cmem[0] and 16 bytes cmem[2] on sm_80/sm_75). nvcc also repeats warning #186-D (pointless comparison of unsigned integer with zero, from c10/util/irange.h) and warning #177-D (variable \"M\" declared but never referenced, in csrc/flash_attn/src/fmha_bwd_launch_template.h) ...]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n",
- "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 226 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 235 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 236 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 238 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 239 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 164 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 174 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 186 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 174 
registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 180 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 190 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 36 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 165 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 182 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads\n", - "ptxas info : Used 190 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 179 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 186 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "/usr/local/cuda/bin/nvcc -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/src -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/cutlass/include -I/usr/local/lib/python3.10/dist-packages/torch/include -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.10/dist-packages/torch/include/TH -I/usr/local/lib/python3.10/dist-packages/torch/include/THC -I/usr/local/cuda/include -I/usr/include/python3.10 -c csrc/flash_attn/src/fmha_bwd_hdim64.cu -o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_bwd_hdim64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options '-fPIC' -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -lineinfo -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 --threads 4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" 
-DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -DTORCH_EXTENSION_NAME=flash_attn_cuda -D_GLIBCXX_USE_CXX11_ABI=0\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "ptxas info : 17 bytes gmem\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 120 bytes stack frame, 244 bytes spill stores, 232 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 26 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 96 bytes spill stores, 112 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 96 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 16 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 180 bytes spill stores, 228 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 144 bytes stack frame, 352 bytes spill stores, 348 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for 
_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 26 registers\n", - "ptxas info : Compiling entry function 
'_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 242 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 244 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 243 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 64 bytes spill stores, 72 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 208 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 208 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 204 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 210 registers\n", - "ptxas info : Compiling entry 
function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 218 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 26 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 230 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 232 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 220 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 230 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill 
loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 120 bytes stack frame, 244 bytes spill stores, 232 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 96 bytes spill stores, 112 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 96 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 16 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 180 bytes spill stores, 228 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 144 bytes stack frame, 352 bytes spill stores, 348 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 250 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 
'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", 
- " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 242 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 244 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 243 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 64 bytes spill stores, 72 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function 
'_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 208 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 208 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 204 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 210 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : 
Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 218 registers\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 230 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 232 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 220 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 
0 bytes spill loads\n", - "ptxas info : Used 224 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 230 registers\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 240 registers\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - 
" instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "csrc/flash_attn/src/fmha_bwd_launch_template.h(64): warning #177-D: variable \"M\" was declared but never referenced\n", - "\n", - "ptxas info : 17 bytes gmem\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 48 bytes spill stores, 96 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 128 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 100 bytes spill stores, 116 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 104 bytes stack frame, 168 bytes spill stores, 160 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 192 bytes spill stores, 240 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 168 bytes stack frame, 396 bytes spill stores, 384 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 247 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry 
function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function 
'_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 252 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 28 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry 
function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 56 bytes stack frame, 80 bytes spill stores, 92 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 165 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 190 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 198 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 25 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 194 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 202 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 204 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 214 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 222 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas 
info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 48 bytes spill stores, 96 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 128 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 23 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes 
cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 100 bytes spill stores, 116 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 104 bytes stack frame, 168 bytes spill stores, 160 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 192 bytes spill stores, 240 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 168 bytes 
stack frame, 396 bytes spill stores, 384 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 249 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 247 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 23 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 252 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 245 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 28 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 56 bytes stack frame, 80 bytes spill stores, 92 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 165 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 190 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - 
"ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 198 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 224 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 23 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 
bytes spill loads\n", - "ptxas info : Used 194 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 202 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 204 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 214 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_80'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 222 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : 21457 bytes gmem\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 72 bytes stack frame, 120 bytes spill stores, 136 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 56 bytes stack frame, 100 bytes spill stores, 92 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 48 bytes stack frame, 52 bytes spill stores, 56 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", 
- "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 236 bytes spill stores, 284 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 136 bytes stack frame, 296 bytes spill stores, 280 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 33 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 28 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 152 bytes spill stores, 160 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 128 bytes stack frame, 220 bytes spill stores, 204 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 56 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 96 bytes stack frame, 280 bytes spill stores, 328 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 184 bytes stack frame, 492 bytes spill stores, 476 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function 
properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 33 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 251 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 28 bytes spill stores, 44 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 32 bytes stack frame, 56 bytes spill stores, 96 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 96 bytes spill stores, 124 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties 
for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 175 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 220 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 202 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 234 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfEEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 33 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 214 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 222 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 208 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 220 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E6__halfELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 228 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 56 bytes spill stores, 52 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 156 bytes spill stores, 180 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 72 bytes stack frame, 120 bytes spill stores, 112 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 64 bytes stack frame, 80 bytes spill stores, 88 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 52 bytes spill stores, 48 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 112 bytes stack frame, 444 bytes spill stores, 512 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 160 bytes stack frame, 508 bytes spill stores, 492 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 38 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 40 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 
16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 88 bytes stack frame, 172 bytes spill stores, 196 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 80 bytes stack frame, 132 bytes spill stores, 124 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 72 bytes stack frame, 84 bytes spill stores, 96 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 120 bytes stack frame, 432 bytes spill stores, 488 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 216 bytes stack frame, 592 bytes spill stores, 572 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 8 bytes stack 
frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 20 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for 
_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 48 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 38 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi2EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 20 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi2EEv17FMHA_dgrad_params\n", - " 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELi1EEv17FMHA_dgrad_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb0ELin1EEv17FMHA_dgrad_params\n", - " 24 bytes stack frame, 32 bytes spill stores, 68 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi8ELj256E13__nv_bfloat16ELb1ELb1ELin1EEv17FMHA_dgrad_params\n", - " 96 bytes stack frame, 124 bytes spill stores, 148 bytes spill loads\n", - "ptxas info : Used 255 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 175 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 196 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi2EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - 
"ptxas info : Used 220 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 202 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELi1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 204 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb0ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 212 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z29fmha_bwd_dq_dk_dv_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb0ELb1ELin1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 236 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z24fmha_bwd_dot_do_o_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 38 registers, 640 bytes cmem[0]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb0EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z40fmha_bwd_q_dk_dv_loop_seqparallel_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi8ELj8E13__nv_bfloat16ELb1ELb1EEv17FMHA_dgrad_params\n", - " 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads\n", - "ptxas info : Used 200 registers, 640 bytes cmem[0], 16 bytes cmem[2]\n", - "[... several hundred lines of repetitive build output elided: nvcc compiles csrc/flash_attn/src/fmha_fwd_hdim128.cu and fmha_fwd_hdim32.cu with -O3 -std=c++17 --use_fast_math for sm_75, sm_80, and sm_90; for each FMHA_kernel_traits instantiation of fmha_fwd_loop_kernel and fmha_bwd_dq_dk_dv_loop_kernel (in __half and __nv_bfloat16 variants), ptxas reports register usage, stack-frame size, spill stores/loads, and cmem usage; torch's c10/util/irange.h(54) repeatedly emits warning #186-D: pointless comparison of unsigned integer with zero ...]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill
loads\n", - "ptxas info : Used 128 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 119 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 128 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 127 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 130 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 178 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 175 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 192 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 203 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 188 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 206 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 219 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 221 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 124 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 129 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 131 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 134 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 126 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 128 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 130 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi32ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 132 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "/usr/local/cuda/bin/nvcc -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/src -I/content/flash-attention/flash-attention/flash-attention/flash-attention/csrc/flash_attn/cutlass/include -I/usr/local/lib/python3.10/dist-packages/torch/include -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.10/dist-packages/torch/include/TH -I/usr/local/lib/python3.10/dist-packages/torch/include/THC -I/usr/local/cuda/include 
-I/usr/include/python3.10 -c csrc/flash_attn/src/fmha_fwd_hdim64.cu -o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_fwd_hdim64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options '-fPIC' -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -lineinfo -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 --threads 4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -DTORCH_EXTENSION_NAME=flash_attn_cuda -D_GLIBCXX_USE_CXX11_ABI=0\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "ptxas info : 17 bytes gmem\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for 
_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 28 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function 
properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 170 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 164 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 164 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 177 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function 
'_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 28 bytes spill stores, 24 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 170 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 164 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 164 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_90'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 177 registers\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - 
"/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(77): here\n", - "\n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero\n", - " detected during:\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "(61): here\n", - " instantiation of \"__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2327): here\n", - " instantiation of \"__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]\" \n", - "/usr/local/lib/python3.10/dist-packages/torch/include/c10/core/TensorImpl.h(2337): here\n", - "\n", - "ptxas info : 17 bytes gmem\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for 
_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 
bytes spill loads\n", - "ptxas info : Used 172 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 165 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 174 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 175 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 172 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 165 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 174 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function 
'_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_80'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 175 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : 22250 bytes gmem\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 254 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 
'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 167 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 166 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb0ELb1EEv17FMHA_fprop_params\n", - 
" 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E6__halfELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 178 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 
registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 32 bytes stack frame, 28 bytes spill stores, 36 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi256ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 8 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads\n", - "ptxas info : Used 255 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 184 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 176 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb0ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas 
info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 171 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb0ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb0EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 176 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "ptxas info : Compiling entry function '_Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params' for 'sm_75'\n", - "ptxas info : Function properties for _Z20fmha_fwd_loop_kernelI18FMHA_kernel_traitsILi128ELi64ELi16ELi1ELi4ELj8E13__nv_bfloat16ELb1ELb1ELb1EEv17FMHA_fprop_params\n", - " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", - "ptxas info : Used 168 registers, 576 bytes cmem[0], 16 bytes cmem[2]\n", - "x86_64-linux-gnu-g++ -pthread -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -g -fwrapv -O2 build/temp.linux-x86_64-cpython-310/csrc/flash_attn/fmha_api.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_block_dgrad_fp16_kernel_loop.sm80.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_bwd_hdim128.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_bwd_hdim32.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_bwd_hdim64.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_fwd_hdim128.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_fwd_hdim32.o build/temp.linux-x86_64-cpython-310/csrc/flash_attn/src/fmha_fwd_hdim64.o -L/usr/local/lib/python3.10/dist-packages/torch/lib -L/usr/local/cuda/lib64 -L/usr/lib/x86_64-linux-gnu -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-cpython-310/flash_attn_cuda.cpython-310-x86_64-linux-gnu.so\n", - "creating build/bdist.linux-x86_64\n", - "creating build/bdist.linux-x86_64/egg\n", - "creating build/bdist.linux-x86_64/egg/flash_attn\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/bert_padding.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/flash_blocksparse_attention.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "creating build/bdist.linux-x86_64/egg/flash_attn/losses\n", - "copying 
build/lib.linux-x86_64-cpython-310/flash_attn/losses/__init__.py -> build/bdist.linux-x86_64/egg/flash_attn/losses\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/losses/cross_entropy.py -> build/bdist.linux-x86_64/egg/flash_attn/losses\n", - "creating build/bdist.linux-x86_64/egg/flash_attn/layers\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/layers/__init__.py -> build/bdist.linux-x86_64/egg/flash_attn/layers\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/layers/rotary.py -> build/bdist.linux-x86_64/egg/flash_attn/layers\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/layers/patch_embed.py -> build/bdist.linux-x86_64/egg/flash_attn/layers\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/flash_attn_interface.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/flash_blocksparse_attn_interface.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "creating build/bdist.linux-x86_64/egg/flash_attn/utils\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/utils/distributed.py -> build/bdist.linux-x86_64/egg/flash_attn/utils\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/utils/__init__.py -> build/bdist.linux-x86_64/egg/flash_attn/utils\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/utils/generation.py -> build/bdist.linux-x86_64/egg/flash_attn/utils\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/utils/benchmark.py -> build/bdist.linux-x86_64/egg/flash_attn/utils\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/utils/pretrained.py -> build/bdist.linux-x86_64/egg/flash_attn/utils\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/flash_attn_triton_og.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/flash_attention.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/__init__.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "creating build/bdist.linux-x86_64/egg/flash_attn/ops\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/ops/activations.py -> build/bdist.linux-x86_64/egg/flash_attn/ops\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/ops/fused_dense.py -> build/bdist.linux-x86_64/egg/flash_attn/ops\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/ops/__init__.py -> build/bdist.linux-x86_64/egg/flash_attn/ops\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/ops/rms_norm.py -> build/bdist.linux-x86_64/egg/flash_attn/ops\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/ops/layer_norm.py -> build/bdist.linux-x86_64/egg/flash_attn/ops\n", - "creating build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/gptj.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/vit.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/bert.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/gpt.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/llama.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/__init__.py -> 
build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/opt.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/models/gpt_neox.py -> build/bdist.linux-x86_64/egg/flash_attn/models\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/flash_attn_triton.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/fused_softmax.py -> build/bdist.linux-x86_64/egg/flash_attn\n", - "creating build/bdist.linux-x86_64/egg/flash_attn/modules\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/modules/mha.py -> build/bdist.linux-x86_64/egg/flash_attn/modules\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/modules/block.py -> build/bdist.linux-x86_64/egg/flash_attn/modules\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/modules/mlp.py -> build/bdist.linux-x86_64/egg/flash_attn/modules\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/modules/__init__.py -> build/bdist.linux-x86_64/egg/flash_attn/modules\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn/modules/embedding.py -> build/bdist.linux-x86_64/egg/flash_attn/modules\n", - "copying build/lib.linux-x86_64-cpython-310/flash_attn_cuda.cpython-310-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/bert_padding.py to bert_padding.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/flash_blocksparse_attention.py to flash_blocksparse_attention.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/losses/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/losses/cross_entropy.py to cross_entropy.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/layers/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/layers/rotary.py to rotary.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/layers/patch_embed.py to patch_embed.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/flash_attn_interface.py to flash_attn_interface.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/flash_blocksparse_attn_interface.py to flash_blocksparse_attn_interface.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/utils/distributed.py to distributed.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/utils/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/utils/generation.py to generation.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/utils/benchmark.py to benchmark.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/utils/pretrained.py to pretrained.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/flash_attn_triton_og.py to flash_attn_triton_og.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/flash_attention.py to flash_attention.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/ops/activations.py to activations.cpython-310.pyc\n", - "byte-compiling 
build/bdist.linux-x86_64/egg/flash_attn/ops/fused_dense.py to fused_dense.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/ops/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/ops/rms_norm.py to rms_norm.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/ops/layer_norm.py to layer_norm.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/gptj.py to gptj.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/vit.py to vit.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/bert.py to bert.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/gpt.py to gpt.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/llama.py to llama.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/opt.py to opt.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/models/gpt_neox.py to gpt_neox.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/flash_attn_triton.py to flash_attn_triton.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/fused_softmax.py to fused_softmax.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/modules/mha.py to mha.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/modules/block.py to block.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/modules/mlp.py to mlp.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/modules/__init__.py to __init__.cpython-310.pyc\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn/modules/embedding.py to embedding.cpython-310.pyc\n", - "creating stub loader for flash_attn_cuda.cpython-310-x86_64-linux-gnu.so\n", - "byte-compiling build/bdist.linux-x86_64/egg/flash_attn_cuda.py to flash_attn_cuda.cpython-310.pyc\n", - "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", - "copying flash_attn.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", - "copying flash_attn.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", - "copying flash_attn.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", - "copying flash_attn.egg-info/requires.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", - "copying flash_attn.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", - "writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n", - "zip_safe flag not set; analyzing archive contents...\n", - "__pycache__.flash_attn_cuda.cpython-310: module references __file__\n", - "creating dist\n", - "creating 'dist/flash_attn-1.0.8-py3.10-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", - "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", - "Processing flash_attn-1.0.8-py3.10-linux-x86_64.egg\n", - "removing '/usr/local/lib/python3.10/dist-packages/flash_attn-1.0.8-py3.10-linux-x86_64.egg' (and everything under it)\n", - "creating /usr/local/lib/python3.10/dist-packages/flash_attn-1.0.8-py3.10-linux-x86_64.egg\n", - "Extracting flash_attn-1.0.8-py3.10-linux-x86_64.egg to /usr/local/lib/python3.10/dist-packages\n", - "flash-attn 1.0.8 is already the active version in easy-install.pth\n", - 
"\n", - "Installed /usr/local/lib/python3.10/dist-packages/flash_attn-1.0.8-py3.10-linux-x86_64.egg\n", - "Processing dependencies for flash-attn==1.0.8\n", - "Searching for ninja==1.11.1\n", - "Best match: ninja 1.11.1\n", - "Processing ninja-1.11.1-py3.10-linux-x86_64.egg\n", - "ninja 1.11.1 is already the active version in easy-install.pth\n", - "Installing ninja script to /usr/local/bin\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages/ninja-1.11.1-py3.10-linux-x86_64.egg\n", - "Searching for packaging==23.1\n", - "Best match: packaging 23.1\n", - "Adding packaging 23.1 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for einops==0.7.0rc1\n", - "Best match: einops 0.7.0rc1\n", - "Processing einops-0.7.0rc1-py3.10.egg\n", - "einops 0.7.0rc1 is already the active version in easy-install.pth\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages/einops-0.7.0rc1-py3.10.egg\n", - "Searching for torch==2.0.1+cu118\n", - "Best match: torch 2.0.1+cu118\n", - "Adding torch 2.0.1+cu118 to easy-install.pth file\n", - "Installing convert-caffe2-to-onnx script to /usr/local/bin\n", - "Installing convert-onnx-to-caffe2 script to /usr/local/bin\n", - "Installing torchrun script to /usr/local/bin\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for triton==2.0.0\n", - "Best match: triton 2.0.0\n", - "Adding triton 2.0.0 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for Jinja2==3.1.2\n", - "Best match: Jinja2 3.1.2\n", - "Adding Jinja2 3.1.2 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for networkx==3.1\n", - "Best match: networkx 3.1\n", - "Adding networkx 3.1 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for sympy==1.11.1\n", - "Best match: sympy 1.11.1\n", - "Adding sympy 1.11.1 to easy-install.pth file\n", - "Installing isympy script to /usr/local/bin\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for typing-extensions==4.6.3\n", - "Best match: typing-extensions 4.6.3\n", - "Adding typing-extensions 4.6.3 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for filelock==3.12.2\n", - "Best match: filelock 3.12.2\n", - "Adding filelock 3.12.2 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for lit==16.0.6\n", - "Best match: lit 16.0.6\n", - "Adding lit 16.0.6 to easy-install.pth file\n", - "Installing lit script to /usr/local/bin\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for cmake==3.25.2\n", - "Best match: cmake 3.25.2\n", - "Adding cmake 3.25.2 to easy-install.pth file\n", - "Installing cmake script to /usr/local/bin\n", - "Installing cpack script to /usr/local/bin\n", - "Installing ctest script to /usr/local/bin\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for MarkupSafe==2.1.3\n", - "Best match: MarkupSafe 2.1.3\n", - "Adding MarkupSafe 2.1.3 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Searching for mpmath==1.3.0\n", - "Best match: mpmath 1.3.0\n", - "Adding mpmath 1.3.0 to easy-install.pth file\n", - "\n", - "Using /usr/local/lib/python3.10/dist-packages\n", - "Finished processing dependencies for flash-attn==1.0.8\n", - "[Errno 2] No such file or directory: '.. 
- "/content/flash-attention/flash-attention/flash-attention/flash-attention\n"
- ]
- },
- {
- "output_type": "error",
- "ename": "ModuleNotFoundError",
- "evalue": "ignored",
- "traceback": [
- " ... [ANSI-escaped Colab traceback omitted: the cell's `from flash_attn.flash_attention import FlashMHA` fails because flash_attn/flash_attention.py executes `from einops import rearrange`, raising ModuleNotFoundError: No module named 'einops'] ...\n"
- ]
- }
- ],
- "source": [
- "\n",
- "!pip install torch\n",
- "!pip install einops\n",
- "!pip install torchscale\n",
- "\n",
- "!git clone https://github.com/HazyResearch/flash-attention.git\n",
- "%cd flash-attention\n",
- "!python setup.py install\n",
- "%cd .. 
# Go back to the parent directory after the installation\n", - "\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "from flash_attn.flash_attention import FlashMHA\n", - "\n", - "from torchscale import XPOS, RelativePositionBias\n", - "\n", - "# Replace this with your correct GPU device\n", - "device = \"cuda:0\"\n", - "dtype=torch.float16\n", - "\n", - "#add alibi, qk layer norm, one write head, multihway,\n", - "class DilatedAttention(nn.Module):\n", - " \"\"\"\n", - " Dilated Attention Module.\n", - "\n", - " Arguments:\n", - " d_model: The dimension of the attention layers.\n", - " num_heads: The number of attention heads.\n", - " dilation_rate: The dilation rate for dilated attention.\n", - " segment_size: The segment size for dilated attention.\n", - " dropout (optional): The dropout probability. Default: 0.0\n", - " casual (optional): If set to True, the attention mechanism is casual. Default: False\n", - " use_xpos (optional): If set to True, xpos is used for positional encoding. Default: False\n", - " use_rel_pos_bias (optional): If set to True, relative position bias is used in the attention mechanism. Default: False\n", - "\n", - " Usage:\n", - " The `DilatedAttention` class can be used as a module for neural networks and is especially suited for transformer architectures.\n", - "\n", - " Example:\n", - " attention = DilatedAttention(d_model=512, num_heads=8, dilation_rate=2, segment_size=64, use_xpos=True, use_rel_pos_bias=True)\n", - " output = attention(input_tensor)\n", - "\n", - " This will return the output tensor after applying dilated attention. The `use_xpos` and `use_rel_pos_bias` parameters allow for switching on positional encoding and relative positional bias respectively.\n", - " \"\"\"\n", - " def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0, casual=False, use_xpos=False, use_rel_pos_bias=False):\n", - " super(DilatedAttention, self).__init__()\n", - " self.d_model = d_model\n", - " self.num_heads = num_heads\n", - "\n", - " self.dilation_rate = dilation_rate\n", - " self.segment_size = segment_size\n", - "\n", - " self.attention = FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype)\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.casual = casual\n", - "\n", - " self.use_xpos = use_xpos\n", - " self.use_rel_pos_bias = use_rel_pos_bias\n", - "\n", - " if use_xpos:\n", - " self.xpos = XPOS(head_dim=d_model//num_heads)\n", - " if use_rel_pos_bias:\n", - " self.relative_bias = RelativePositionBias(num_buckets=32, max_distance=128, n_heads=num_heads)\n", - "\n", - " def get_mask(self, i, j):\n", - " return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 2)\n", - "\n", - " def forward(self, x):\n", - " batch_size, seq_len, _ = x.shape\n", - "\n", - " if self.use_xpos:\n", - " x = self.xpos(x)\n", - "\n", - " # Split and sparsify\n", - " x = x.view(batch_size, -1, self.segment_size, self.d_model)\n", - " x = x[:, :, :: self.dilation_rate, :]\n", - "\n", - " # Perform attention\n", - " attn_output, _ = self.attention(x, x, x)\n", - "\n", - " if self.use_rel_pos_bias:\n", - " attn_output += self.relative_bias(batch_size, attn_output.size(1), attn_output.size(1))\n", - "\n", - " # if casual create a mask and apply to the output\n", - " if self.casual:\n", - " mask = self.get_mask(attn_output.size(1), attn_output.size(1))\n", - " attn_output = attn_output.masked_fill(mask, float('-inf'))\n", - "\n", - " # apply dropout\n", - " 
attn_output = self.dropout(attn_output)\n", - "\n", - " # Scatter and concatenate\n", - " attn_output = attn_output.view(batch_size, -1, self.d_model)\n", - " return attn_output\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6QJNBSLSx-aX" - }, - "outputs": [], - "source": [ - "import time\n", - "import unittest\n", - "import torch\n", - "\n", - "# from LongNet import DilatedAttention, MultiModalDilationAttention\n", - "\n", - "class TestDilatedAttention(unittest.TestCase):\n", - "\n", - " def test_output_shape(self):\n", - " # Setup\n", - " input_tensor = torch.randn(2, 128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - "\n", - " # Action\n", - " output = dilated_attention(input_tensor)\n", - "\n", - " # Assert\n", - " self.assertEqual(output.shape, (2, 128, 512))\n", - "\n", - " def test_xpos(self):\n", - " # Setup\n", - " input_tensor = torch.randn(2, 128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64, use_xpos=True)\n", - "\n", - " # Action\n", - " output = dilated_attention(input_tensor)\n", - "\n", - " # Assert\n", - " self.assertEqual(output.shape, (2, 128, 512))\n", - "\n", - " def test_relative_position_bias(self):\n", - " # Setup\n", - " input_tensor = torch.randn(2, 128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64, use_rel_pos_bias=True)\n", - "\n", - " # Action\n", - " output = dilated_attention(input_tensor)\n", - "\n", - " # Assert\n", - " self.assertEqual(output.shape, (2, 128, 512))\n", - "\n", - "\n", - " def test_attention_consistency(self):\n", - " # Setup\n", - " input_tensor = torch.randn(2, 128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - "\n", - " # Action\n", - " output = dilated_attention(input_tensor)\n", - "\n", - " # Assert\n", - " self.assertTrue((output.std(dim=-1) > 0).all())\n", - "\n", - " def test_speed(self):\n", - " # Setup\n", - " input_tensor = torch.randn(2, 1024, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - "\n", - " # Action\n", - " start_time = time.time()\n", - " output = dilated_attention(input_tensor)\n", - " end_time = time.time()\n", - "\n", - " # Assert\n", - " self.assertLess(end_time - start_time, 1)\n", - "\n", - " def test_gradient_flow(self):\n", - " # Setup\n", - " input_tensor = torch.randn(2, 128, 512, requires_grad=True)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - "\n", - " # Action\n", - " output = dilated_attention(input_tensor)\n", - " output.sum().backward()\n", - " grad_norm = input_tensor.grad.norm().item()\n", - "\n", - " # Assert\n", - " self.assertLess(grad_norm, 1e6)\n", - " self.assertGreater(grad_norm, 1e-6)\n", - "\n", - " def test_scaling(self):\n", - " input_tensor = torch.randn(2, 1024, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - " start_time = time.time()\n", - " _ = dilated_attention(input_tensor)\n", - " time_for_1024 = time.time() - start_time\n", - "\n", - " input_tensor = torch.randn(2, 2048, 512)\n", - " start_time = time.time()\n", - " _ = dilated_attention(input_tensor)\n", - " time_for_2048 = time.time() - start_time\n", - "\n", - " self.assertLessEqual(time_for_2048/time_for_1024, 2)\n", - "\n", - " def test_reproducibility(self):\n", - " torch.manual_seed(0)\n", - " input_tensor = torch.randn(2, 128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - " output1 = dilated_attention(input_tensor)\n", - "\n", - " torch.manual_seed(0)\n", - " input_tensor = torch.randn(2, 
128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - " output2 = dilated_attention(input_tensor)\n", - "\n", - " self.assertTrue(torch.allclose(output1, output2))\n", - "\n", - " def test_attention_distribution(self):\n", - " input_tensor = torch.randn(2, 128, 512)\n", - " dilated_attention = DilatedAttention(512, 8, 2, 64)\n", - " _, attn_weights = dilated_attention(input_tensor)\n", - "\n", - " self.assertTrue(torch.allclose(attn_weights.sum(dim=-1), torch.tensor(1.)))\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wzPNAUpvzpml" - }, - "outputs": [], - "source": [ - "import timeit\n", - "import torch\n", - "\n", - "#model config\n", - "d_model = 512\n", - "num_heads = 8\n", - "dilation_rate = 2\n", - "segment_size = 64\n", - "\n", - "device = \"cuda:0\"\n", - "dtype=torch.float16\n", - "\n", - "#input data\n", - "batch_size = 32\n", - "seq_len = 1024\n", - "\n", - "\n", - "#create model and data\n", - "model = DilatedAttention(d_model, num_heads, dilation_rate, segment_size).to(device)\n", - "x = torch.randn((batch_size, seq_len, d_model), device=device, dtype=dtype)\n", - "\n", - "\n", - "#test forward pass\n", - "with torch.no_grad():\n", - " output = model(x)\n", - " print(f\"Output shape: {output.shape}\") # expected (batch_size, seq_Len)\n", - "\n", - "\n", - "#benchmark model\n", - "num_runs = 1000\n", - "start_time = timeit.default_timer()\n", - "for _ in range(num_runs):\n", - " model(x)\n", - "\n", - "elapsed_time = timeit.default_timer() - start_time\n", - "print(f\"Average forward pass time: {elapsed_time / num_runs:.6f} seconds\")" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..88963c7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,65 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "longnet" +version = "0.5.7" +description = "LongNet - Pytorch" +license = "MIT" +authors = ["Kye Gomez "] +homepage = "https://github.com/kyegomez/LongNet" +documentation = "https://github.com/kyegomez/LongNet" # Add this if you have documentation. 
+readme = "README.md" # Assuming you have a README.md +repository = "https://github.com/kyegomez/LongNet" +keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.6" +] +packages = [ + { include = "long_net" }, + { include = "long_net/**/*.py" }, +] + +[tool.poetry.dependencies] +python = "^3.6" +torch = "*" +einops = "*" +torchscale = "*" +transformers = "*" +accelerate = "*" +bitsandbytes = "*" +dataclasses = "*" +beartype = "*" +zetascale = "*" + + +[tool.poetry.group.lint.dependencies] +ruff = ">=0.0.249,<0.1.8" +types-toml = "^0.10.8.1" +types-redis = "^4.3.21.6" +types-pytz = "^2023.3.0.0" +black = "^23.1.0" +types-chardet = "^5.0.4.6" +mypy-protobuf = "^3.0.0" + + +[tool.autopep8] +max_line_length = 80 +ignore = "E501,W6" # or ["E501", "W6"] +in-place = true +recursive = true +aggressive = 3 + +[tool.ruff] +line-length = 80 + +[tool.black] +line-length = 80 +target-version = ['py38'] +preview = true + diff --git a/requirements.txt b/requirements.txt index b2d6d4e..0763ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,9 @@ torch einops - accelerate bitsandbytes fairscale - -timm -ninja packaging - transformers -unittest -timeit \ No newline at end of file +beartype +zetascale \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 4566dc0..0000000 --- a/setup.py +++ /dev/null @@ -1,38 +0,0 @@ -from setuptools import setup, find_packages -# - -setup( - name = 'LongNet', - packages = find_packages(exclude=[]), - version = '0.3.5', - license='MIT', - description = 'LongNet - Pytorch', - author = 'Kye Gomez', - author_email = 'kye@apac.ai', - long_description_content_type = 'text/markdown', - url = 'https://github.com/kyegomez/LongNet', - keywords = [ - 'artificial intelligence', - 'deep learning', - 'optimizers', - "Prompt Engineering" - ], - install_requires=[ - 'torch', - 'einops', - 'torchscale', - 'transformers', - 'accelerate', - 'bitsandbytes', - 'fairscale', - 'timm', - 'dataclasses', - ], - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', - ], -) \ No newline at end of file diff --git a/test/model_test.py b/test/model_test.py deleted file mode 100644 index b625647..0000000 --- a/test/model_test.py +++ /dev/null @@ -1,35 +0,0 @@ -# from LongNet.model import LongNetTokenizer, LongNetSelector -import torch -# from model import LongNetTokenizer, -from LongNet.model import LongNetTokenizer, LongNet - -class LongNetTest: - def __init__(self): - self.longnet_selector = LongNet() - self.tokenizer = LongNetTokenizer() - - def run_test(self, model_type: str): - data = { - 'target_text': ["This is a test sentence."] * 2, - 'image': torch.rand(2, 3, 224, 224) # 2 random images - } - - inputs = self.tokenizer.tokenize(data) - - if model_type.lower() == 'multimodal': - self._test_model('multimodal', inputs) - elif model_type.lower() == 'language': - self._test_model('language', inputs) - else: - raise ValueError(f"Invalid model_type: {model_type}. 
Please use either 'multimodal' or 'language'.") - - def _test_model(self, model_type: str, inputs: dict): - print(f"Testing {model_type} LongNet model...") - model = self.longnet_selector.get_model(model_type) - outputs = model(**inputs) - print(f"{model_type} LongNet model forward pass succeeded!") - -# # Now you can use the class like this: -# tester = LongNetTest() -# tester.run_test('multimodal') -# tester.run_test('language') diff --git a/test/README.md b/tests/README.md similarity index 100% rename from test/README.md rename to tests/README.md diff --git a/test/attention.py b/tests/attention.py similarity index 70% rename from test/attention.py rename to tests/attention.py index 9caf1b2..8716ff8 100644 --- a/test/attention.py +++ b/tests/attention.py @@ -2,10 +2,10 @@ import unittest import torch -from LongNet import DilatedAttention, MultiModalDilationAttention +from long_net import DilatedAttention -class TestDilatedAttention(unittest.TestCase): +class TestDilatedAttention(unittest.TestCase): def test_output_shape(self): # Setup input_tensor = torch.randn(2, 128, 512) @@ -31,7 +31,9 @@ def test_xpos(self): def test_relative_position_bias(self): # Setup input_tensor = torch.randn(2, 128, 512) - dilated_attention = DilatedAttention(512, 8, 2, 64, use_rel_pos_bias=True) + dilated_attention = DilatedAttention( + 512, 8, 2, 64, use_rel_pos_bias=True + ) # Action output = dilated_attention(input_tensor) @@ -39,7 +41,6 @@ def test_relative_position_bias(self): # Assert self.assertEqual(output.shape, (2, 128, 512)) - def test_attention_consistency(self): # Setup input_tensor = torch.randn(2, 128, 512) @@ -58,7 +59,7 @@ def test_speed(self): # Action start_time = time.time() - output = dilated_attention(input_tensor) + dilated_attention(input_tensor) end_time = time.time() # Assert @@ -84,37 +85,39 @@ def test_scaling(self): start_time = time.time() _ = dilated_attention(input_tensor) time_for_1024 = time.time() - start_time - + input_tensor = torch.randn(2, 2048, 512) start_time = time.time() _ = dilated_attention(input_tensor) time_for_2048 = time.time() - start_time - - self.assertLessEqual(time_for_2048/time_for_1024, 2) - + + self.assertLessEqual(time_for_2048 / time_for_1024, 2) + def test_reproducibility(self): torch.manual_seed(0) input_tensor = torch.randn(2, 128, 512) dilated_attention = DilatedAttention(512, 8, 2, 64) output1 = dilated_attention(input_tensor) - + torch.manual_seed(0) input_tensor = torch.randn(2, 128, 512) dilated_attention = DilatedAttention(512, 8, 2, 64) output2 = dilated_attention(input_tensor) - + self.assertTrue(torch.allclose(output1, output2)) - + def test_attention_distribution(self): input_tensor = torch.randn(2, 128, 512) dilated_attention = DilatedAttention(512, 8, 2, 64) _, attn_weights = dilated_attention(input_tensor) - - self.assertTrue(torch.allclose(attn_weights.sum(dim=-1), torch.tensor(1.))) + + self.assertTrue( + torch.allclose(attn_weights.sum(dim=-1), torch.tensor(1.0)) + ) def setUp(self): - self.d_model = 128 - self.num_heads = 4 + self.dim = 128 + self.heads = 4 self.dilation_rate = 2 self.segment_size = 32 self.dropout = 0.1 @@ -125,13 +128,24 @@ def setUp(self): self.batch_size = 10 self.seq_len = 100 - self.x = torch.rand(self.batch_size, self.seq_len, self.d_model) + self.x = torch.rand(self.batch_size, self.seq_len, self.dim) - self.sparse_dilated_attention = DilatedAttention(self.d_model, self.num_heads, self.dilation_rate, self.segment_size, self.dropout, self.casual, self.use_xpos, self.use_rel_pos_bias) + 
self.sparse_dilated_attention = DilatedAttention( + self.dim, + self.heads, + self.dilation_rate, + self.segment_size, + self.dropout, + self.casual, + self.use_xpos, + self.use_rel_pos_bias, + ) def test_forward_pass(self): output = self.sparse_dilated_attention(self.x) - self.assertEqual(output.size(), (self.batch_size, self.seq_len, self.d_model)) + self.assertEqual( + output.size(), (self.batch_size, self.seq_len, self.dim) + ) def test_attention_outputs(self): output = self.sparse_dilated_attention(self.x) @@ -142,38 +156,32 @@ def test_dropout(self): self.sparse_dilated_attention.dropout.p = 1.0 output = self.sparse_dilated_attention(self.x) self.assertTrue(torch.all(output == 0)) - - - -class TestMultiModalDilationAttention(unittest.TestCase): +# class TestMultiModalDilationAttention(unittest.TestCase): - def test_output_shape(self): - # Setup - input_tensor = [torch.randn(2, 128, 512), torch.randn(2, 128, 512)] - multi_modal_attention = MultiModalDilationAttention(512, 8, 2, 64, num_modalities=2) +# def test_output_shape(self): +# # Setup +# input_tensor = [torch.randn(2, 128, 512), torch.randn(2, 128, 512)] +# multi_modal_attention = MultiModalDilationAttention(512, 8, 2, 64, num_modalities=2) - # Action - output = multi_modal_attention(input_tensor) +# # Action +# output = multi_modal_attention(input_tensor) - # Assert - self.assertEqual(output.shape, (2, 128, 512)) +# # Assert +# self.assertEqual(output.shape, (2, 128, 512)) - def test_single_modality(self): - # Setup - input_tensor = [torch.randn(2, 128, 512)] - multi_modal_attention = MultiModalDilationAttention(512, 8, 2, 64, num_modalities=1) +# def test_single_modality(self): +# # Setup +# input_tensor = [torch.randn(2, 128, 512)] +# multi_modal_attention = MultiModalDilationAttention(512, 8, 2, 64, num_modalities=1) - # Action - output = multi_modal_attention(input_tensor) +# # Action +# output = multi_modal_attention(input_tensor) - # Assert - self.assertEqual(output.shape, (2, 128, 512)) +# # Assert +# self.assertEqual(output.shape, (2, 128, 512)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() - - - diff --git a/tests/example_old.py b/tests/example_old.py new file mode 100644 index 0000000..d98443d --- /dev/null +++ b/tests/example_old.py @@ -0,0 +1,37 @@ +import timeit +import torch +from long_net.attention import DilatedAttention + +# model config +dim = 512 +heads = 8 +dilation_rate = 2 +segment_size = 64 + +device = "cuda:0" +dtype = torch.float16 + +# input data +batch_size = 32 +seq_len = 1024 + + +# create model and data +model = DilatedAttention(dim, heads, dilation_rate, segment_size).to(device) +x = torch.randn((batch_size, seq_len, dim), device=device, dtype=dtype) + + +# test forward pass +with torch.no_grad(): + output = model(x) + print(f"Output shape: {output.shape}") # expected (batch_size, seq_Len) + + +# benchmark model +num_runs = 1000 +start_time = timeit.default_timer() +for _ in range(num_runs): + model(x) + +elapsed_time = timeit.default_timer() - start_time +print(f"Average forward pass time: {elapsed_time / num_runs:.6f} seconds") diff --git a/tests/flops_test.py b/tests/flops_test.py new file mode 100644 index 0000000..a76a8c8 --- /dev/null +++ b/tests/flops_test.py @@ -0,0 +1,77 @@ +import torch +import time + +from long_net.attention import DilatedAttention + + +# Initialize parameters +bsz = 32 +dim = 512 +heads = 8 +dilation_rate = 2 +segment_size = 512 # You might want to adjust this +dropout = 0.1 +casual = False +use_xpos = False +use_rel_pos_bias = False + 
+sequence_lengths = list(range(500, 2500, 500))
+
+# Device configuration
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype = torch.float32
+
+# Initialize model
+model = DilatedAttention(
+    dim=dim,
+    heads=heads,
+    dilation_rate=dilation_rate,
+    segment_size=segment_size,
+    dropout=dropout,
+    casual=casual,
+    use_xpos=use_xpos,
+    use_rel_pos_bias=use_rel_pos_bias,
+).to(device)
+
+time_taken = []
+tflops_per_s = []
+
+# Benchmark model
+for seq_len in sequence_lengths:
+    x = torch.randn(bsz, seq_len, dim).to(device).type(dtype)
+    torch.cuda.synchronize()
+
+    start = time.time()
+    output = model(x)
+    torch.cuda.synchronize()
+    elapsed = time.time() - start
+
+    time_taken.append(elapsed)
+    total_flops = 4 * seq_len**2 * (dim // heads) * heads
+    tflops_per_s.append(total_flops / elapsed / 1e12)  # Convert to TFLOPs
+
+# Print benchmark results
+for seq_len, elapsed, tflops in zip(sequence_lengths, time_taken, tflops_per_s):
+    print(
+        f"Sequence length: {seq_len}, Time elapsed: {elapsed} s, TFLOPs/s:"
+        f" {tflops}"
+    )
+
+
+# # Plotting
+# plt.figure(figsize=(10,4))
+
+# plt.subplot(1,2,1)
+# plt.plot(sequence_lengths, time_taken)
+# plt.title('Time Taken vs Sequence Length')
+# plt.xlabel('Sequence Length')
+# plt.ylabel('Time Taken (s)')
+
+# plt.subplot(1,2,2)
+# plt.plot(sequence_lengths, tflops_per_s)
+# plt.title('Performance vs Sequence Length')
+# plt.xlabel('Sequence Length')
+# plt.ylabel('Performance (TFLOPs/s)')
+
+# plt.tight_layout()
+# plt.show()
diff --git a/tests/model/dilated_model.py b/tests/model/dilated_model.py
new file mode 100644
index 0000000..5d6a3ea
--- /dev/null
+++ b/tests/model/dilated_model.py
@@ -0,0 +1,53 @@
+import unittest
+import torch
+from long_net import DilatedLongNet
+
+
+class TestDilatedLongNet(unittest.TestCase):
+    def setUp(self):
+        self.model = DilatedLongNet()
+
+    def test_model_shape(self):
+        # Test input and output dimensions
+        x = torch.randint(0, 16000, (4, 1024))
+        out = self.model(x)
+        self.assertEqual(out.shape, (4, 1024, 16000))
+
+    def test_generate(self):
+        # Test sample generation
+        x = torch.randint(0, 16000, (4, 1024))
+        out = self.model.generate(x, temperature=1.0, filter_thres=0.9)
+        self.assertEqual(out.shape[0], 4)
+        self.assertEqual(out.shape[1], 1024)
+        self.assertEqual(out.shape[2], 4)
+
+    def test_dilation(self):
+        # Test dilated attention
+        self.assertEqual(self.model.dilation_rate, 1)
+        self.assertEqual(self.model.segment_size, 0)
+        self.assertFalse(self.model.casual)
+
+    def test_gradients(self):
+        # Test backward pass through a scalar reduction of the logits
+        x = torch.randint(0, 16000, (4, 1024))
+        out = self.model(x)
+        out.sum().backward()
+        for name, param in self.model.named_parameters():
+            self.assertTrue(param.grad is not None)
+            self.assertFalse(torch.isnan(param.grad).any())
+            param.grad.zero_()
+
+    def test_training(self):
+        # End-to-end training test
+        optim = torch.optim.Adam(self.model.parameters())
+        for _ in range(100):
+            x = torch.randint(0, 16000, (4, 1024))
+            loss = self.model(x).loss
+            optim.zero_grad()
+            loss.backward()
+            optim.step()
+        self.assertLess(loss.item(), 10)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/model/model.py b/tests/model/model.py
new file mode 100644
index 0000000..cf9af06
--- /dev/null
+++ b/tests/model/model.py
@@ -0,0 +1,29 @@
+import torch
+import time
+from long_net import DilatedLongNet
+
+# Instantiate the DilatedLongNet model
+model = DilatedLongNet()
+
+# Define the input tensor (integer token ids in [0, vocab_size))
+batch_size = 1
+sequence_length = 512
+input_tensor = torch.randint(0, 16000, (batch_size, sequence_length))
+
+# Enable CUDA if 
available +if torch.cuda.is_available(): + model = model.cuda() + input_tensor = input_tensor.cuda() + +# Measure the model forward pass speed +start_time = time.time() +_ = model(input_tensor) +end_time = time.time() + +forward_pass_time = end_time - start_time +print(f"Model Forward Pass Time: {forward_pass_time} seconds") + + +# Count the number of parameters in the model +num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) +print(f"Number of Model Parameters: {num_parameters}") diff --git a/tests/model/model_test.py b/tests/model/model_test.py new file mode 100644 index 0000000..3c691aa --- /dev/null +++ b/tests/model/model_test.py @@ -0,0 +1,41 @@ +# from long_net.model import LongNetTokenizer, LongNetSelector +import torch + +# from model import LongNetTokenizer, +from long_net.model import LongNetTokenizer, LongNet + + +class LongNetTest: + def __init__(self): + self.longnet_selector = LongNet() + self.tokenizer = LongNetTokenizer() + + def run_test(self, model_type: str): + data = { + "target_text": ["This is a test sentence."] * 2, + "image": torch.rand(2, 3, 224, 224), # 2 random images + } + + inputs = self.tokenizer.tokenize(data) + + if model_type.lower() == "multimodal": + self._test_model("multimodal", inputs) + elif model_type.lower() == "language": + self._test_model("language", inputs) + else: + raise ValueError( + f"Invalid model_type: {model_type}. Please use either" + " 'multimodal' or 'language'." + ) + + def _test_model(self, model_type: str, inputs: dict): + print(f"Testing {model_type} LongNet model...") + model = self.longnet_selector.get_model(model_type) + model(**inputs) + print(f"{model_type} LongNet model forward pass succeeded!") + + +# # Now you can use the class like this: +# tester = LongNetTest() +# tester.run_test('multimodal') +# tester.run_test('language') diff --git a/test/test.py b/tests/model/test.py similarity index 85% rename from test/test.py rename to tests/model/test.py index 1e9d704..0b364b8 100644 --- a/test/test.py +++ b/tests/model/test.py @@ -1,7 +1,7 @@ -import torch import unittest from transformers import TrainingArguments, Trainer -from LongNet.model import LongNetTokenizer, LongNet +from long_net.model import LongNetTokenizer, LongNet + class TestLongNetModels(unittest.TestCase): def setUp(self): @@ -14,7 +14,7 @@ def setUp(self): save_steps=10_000, save_total_limit=2, logging_steps=500, - logging_dir='./test_logs', + logging_dir="./test_logs", ) def test_language_model(self): @@ -29,11 +29,12 @@ def test_language_model(self): def get_sample_dataset(self): # generate a simple dataset for testing - data = {'target_text': ["This is a test sentence."] * 10} + data = {"target_text": ["This is a test sentence."] * 10} # Tokenize dataset inputs = self.tokenizer.tokenize(data) return inputs + if __name__ == "__main__": unittest.main() diff --git a/tests/speed_sequence.py b/tests/speed_sequence.py new file mode 100644 index 0000000..d5d64e2 --- /dev/null +++ b/tests/speed_sequence.py @@ -0,0 +1,66 @@ +import torch +import time + +from long_net.attention import DilatedAttention +import matplotlib.pyplot as plt + + +# Define sequence lengths to test +seq_lengths = [64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 64000] + +# Define batch size and feature dimension +batch_size = 32 +dim = 512 + +device = "cuda:0" + +# Initialize DilatedAttentionold module +attention = DilatedAttention( + dim=dim, + heads=8, + dilation_rate=2, + segment_size=64, + use_xpos=False, + use_rel_pos_bias=False, +) + +# Move the model 
to GPU +attention.to(device) + +# Prepare a list to store times +times = [] + +# Benchmark each sequence length +for seq_len in seq_lengths: + # Create a random input tensor + x = torch.randn(batch_size, seq_len, dim).to(device) + + # Warm up GPU + for _ in range(10): + _ = attention(x) + + # Benchmark + start_time = time.time() + for _ in range(100): + _ = attention(x) + end_time = time.time() + + # Calculate average forward pass time + avg_time = (end_time - start_time) / 100 + + # Store the time + times.append(avg_time) + + print( + f"Sequence length: {seq_len}, Average forward pass time:" + f" {avg_time} seconds" + ) + +# Plot the results +plt.figure(figsize=(10, 6)) +plt.plot(seq_lengths, times, marker="o") +plt.title("Average forward pass time for different sequence lengths") +plt.xlabel("Sequence length") +plt.ylabel("Average forward pass time (seconds)") +plt.grid(True) +plt.show() diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..a83783c --- /dev/null +++ b/tests/test.py @@ -0,0 +1,38 @@ +import timeit +import torch +from long_net import DilatedAttention + + +# model config +dim = 512 +heads = 8 +dilation_rate = 2 +segment_size = 64 + +device = "cuda:0" +dtype = torch.float16 + +# input data +batch_size = 32 +seq_len = 1024 + + +# create model and data +model = DilatedAttention(dim, heads, dilation_rate, segment_size).to(device) +x = torch.randn((batch_size, seq_len, dim), device=device, dtype=dtype) + + +# test forward pass +with torch.no_grad(): + output = model(x) + print(f"Output shape: {output.shape}") # expected (batch_size, seq_Len) + + +# benchmark model +num_runs = 1000 +start_time = timeit.default_timer() +for _ in range(num_runs): + model(x) + +elapsed_time = timeit.default_timer() - start_time +print(f"Average forward pass time: {elapsed_time / num_runs:.6f} seconds") diff --git a/tests/test_attention.py b/tests/test_attention.py new file mode 100644 index 0000000..35915e8 --- /dev/null +++ b/tests/test_attention.py @@ -0,0 +1,101 @@ +import pytest +import torch +from long_net import DilatedAttention + + +@pytest.fixture +def dilated_attention(): + return DilatedAttention( + dim=512, + heads=8, + dilation_rate=2, + segment_size=64, + dropout=0.1, + causal=False, + use_xpos=True, + use_rel_pos_bias=True, + qk_norm=False, + dtype=torch.float32, + device="cpu", + ) + + +@pytest.mark.parametrize( + "input_shape", + [ + (1, 10, 512), + (2, 20, 512), + (5, 50, 512), + (10, 100, 512), + (20, 200, 512), + ], +) +def test_forward_shape(dilated_attention, input_shape): + x = torch.rand(input_shape) + output = dilated_attention(x) + assert ( + output.shape == input_shape + ), f"Expected output shape {input_shape}, but got {output.shape}" + + +@pytest.mark.parametrize( + "input_val", + [ + torch.zeros((1, 10, 512)), + torch.ones((1, 10, 512)), + torch.full((1, 10, 512), fill_value=0.5), + ], +) +def test_forward_values(dilated_attention, input_val): + output = dilated_attention(input_val) + assert not torch.isnan(output).any(), "Output contains NaN values" + assert not torch.isinf(output).any(), "Output contains infinite values" + + +@pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64]) +def test_forward_dtype(dilated_attention, input_dtype): + x = torch.rand((1, 10, 512), dtype=input_dtype) + output = dilated_attention(x) + assert ( + output.dtype == input_dtype + ), f"Expected output dtype {input_dtype}, but got {output.dtype}" + + +# Add more tests as needed +@pytest.mark.parametrize("causal", [True, False]) +def 
+    dilated_attention.causal = causal
+    x = torch.rand((1, 10, 512))
+    output = dilated_attention(x)
+    assert output is not None, "Output is None"
+
+
+@pytest.mark.parametrize("use_xpos", [True, False])
+def test_forward_use_xpos(dilated_attention, use_xpos):
+    dilated_attention.use_xpos = use_xpos
+    x = torch.rand((1, 10, 512))
+    output = dilated_attention(x)
+    assert output is not None, "Output is None"
+
+
+@pytest.mark.parametrize("use_rel_pos_bias", [True, False])
+def test_forward_use_rel_pos_bias(dilated_attention, use_rel_pos_bias):
+    dilated_attention.use_rel_pos_bias = use_rel_pos_bias
+    x = torch.rand((1, 10, 512))
+    output = dilated_attention(x)
+    assert output is not None, "Output is None"
+
+
+@pytest.mark.parametrize("qk_norm", [True, False])
+def test_forward_qk_norm(dilated_attention, qk_norm):
+    dilated_attention.qk_norm = qk_norm
+    x = torch.rand((1, 10, 512))
+    output = dilated_attention(x)
+    assert output is not None, "Output is None"
+
+
+def test_forward_with_mask(dilated_attention):
+    x = torch.rand((1, 10, 512))
+    mask = torch.ones((1, 10, 10))
+    output = dilated_attention(x, mask=mask)
+    assert output is not None, "Output is None"
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..97a4edc
--- /dev/null
+++ b/train.py
@@ -0,0 +1,105 @@
+import gzip
+import random
+
+import numpy as np
+import torch
+import tqdm
+from torch.utils.data import DataLoader, Dataset
+
+from long_net.model import LongNetTransformer, AutoregressiveWrapper
+from long_net.utils import StableAdamWUnfused
+
+
+# constants
+NUM_BATCHES = int(1e5)
+BATCH_SIZE = 4
+GRADIENT_ACCUMULATE_EVERY = 4
+LEARNING_RATE = 2e-4
+VALIDATE_EVERY = 100
+GENERATE_EVERY = 500
+GENERATE_LENGTH = 512
+SEQ_LEN = 8196
+
+
+# helpers
+def cycle(loader):
+    while True:
+        for data in loader:
+            yield data
+
+
+def decode_token(token):
+    return str(chr(max(32, token)))
+
+
+def decode_tokens(tokens):
+    return "".join(list(map(decode_token, tokens)))
+
+
+# instantiate GPT-like decoder model
+model = LongNetTransformer(num_tokens=256, dim=512, depth=8)
+
+model = AutoregressiveWrapper(model, max_seq_len=SEQ_LEN)
+# model.cuda()
+
+# prepare enwik8 data
+
+with gzip.open("./data/enwik8.gz") as file:
+    # frombuffer returns a read-only view, so copy before torch.from_numpy
+    X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
+    trX, vaX = np.split(X, [int(90e6)])
+    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+
+
+class TextSamplerDataset(Dataset):
+    def __init__(self, data, seq_len):
+        super().__init__()
+        self.data = data
+        self.seq_len = seq_len
+
+    def __getitem__(self, index):
+        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+        full_seq = self.data[rand_start : rand_start + self.seq_len + 1].long()
+        return full_seq  # .cuda()
+
+    def __len__(self):
+        return self.data.size(0) // self.seq_len
+
+
+train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
+val_dataset = TextSamplerDataset(data_val, SEQ_LEN)
+train_loader = cycle(DataLoader(train_dataset, batch_size=BATCH_SIZE))
+val_loader = cycle(DataLoader(val_dataset, batch_size=BATCH_SIZE))
+
+# optimizer
+
+optim = StableAdamWUnfused(model.parameters(), lr=LEARNING_RATE)
+
+# training
+
+for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
+    model.train()
+
+    for __ in range(GRADIENT_ACCUMULATE_EVERY):
+        loss = model(next(train_loader))
+        loss.backward()
+
+    print(f"training loss: {loss.item()}")
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+    optim.step()
+    optim.zero_grad()
+
+    if i % VALIDATE_EVERY == 0:
+        model.eval()
+        with torch.no_grad():
+            loss = model(next(val_loader))
+            print(f"validation loss: {loss.item()}")
+
+    if i % GENERATE_EVERY == 0:
+        model.eval()
+        inp = random.choice(val_dataset)[:-1]
+        prime = decode_tokens(inp)
+        print(f"{prime}\n\n{'*' * 100}")
+
+        sample = model.generate(inp[None, ...], GENERATE_LENGTH)
+        output_str = decode_tokens(sample[0])
+        print(output_str)
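
As a quick sanity check of what this diff adds, a minimal smoke test is sketched below. It assumes the package layout used in the tests above: `DilatedAttention` exported from `long_net` with the constructor arguments seen in `tests/test.py`, and an output shaped like its input, as `tests/test_attention.py` asserts. It is illustrative only, not part of the diff.

```python
import torch
from long_net import DilatedAttention

# construct the module with the same settings used in tests/test.py
attn = DilatedAttention(dim=512, heads=8, dilation_rate=2, segment_size=64)

# a small CPU-sized input: (batch, sequence length, feature dim)
x = torch.randn(2, 128, 512)

with torch.no_grad():
    out = attn(x)

# dilated attention preserves the input shape
assert out.shape == x.shape
print(out.shape)
```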