Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Hotfix] Fix accuracy and align attention method api with Triton kernel #5229

Merged
merged 7 commits into from
Jan 8, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
alignment in attention
  • Loading branch information
CjhHa1 committed Jan 5, 2024

Unverified

This user has not yet uploaded their public signing key.
commit 535aec54369411221c7b3db0bb6dca459053c6b7
98 changes: 42 additions & 56 deletions colossalai/inference/modeling/layers/attention.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb


def copy_to_cache(source, cache, lengths, block_tables, type: str = "prefill"):
@@ -71,23 +69,14 @@ def convert_kvcache(cache, lengths, block_tables):
return torch.stack(padded_cache, dim=0)


class PagedAttention(nn.Module):
class PagedAttention:
"""
Pure Torch implementation version of paged_attention.
Holds different types of forward function and useful components.
"""

def __init__(self, num_heads: int, head_size: int, scale: float = 1.0, sliding_window: Optional[int] = None):
super().__init__()
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
self.sliding_window = sliding_window
self._init_rope()

def _init_rope(self):
self.rotary_emb = LlamaRotaryEmbedding(self.head_size)

def pad_and_reshape(self, tensor, seq_lengths, max_seq_len, num_heads, head_size):
@staticmethod
def pad_and_reshape(tensor, seq_lengths, max_seq_len, num_heads, head_size):
bsz = len(seq_lengths)
padded_tensor = torch.zeros(bsz, max_seq_len, num_heads, head_size)

@@ -98,13 +87,14 @@ def pad_and_reshape(self, tensor, seq_lengths, max_seq_len, num_heads, head_size
token_idx += seq_len
return padded_tensor

def generate_padding_mask(self, lengths, max_seq_len):
@staticmethod
def generate_padding_mask(lengths, max_seq_len):
range_tensor = torch.arange(max_seq_len).expand(len(lengths), max_seq_len)
padding_mask = range_tensor < lengths.unsqueeze(1)
return padding_mask

@staticmethod
def nopad_context_forward(
self,
q: torch.Tensor, # [num_tokens, num_heads, head_size]
k: torch.Tensor,
v: torch.Tensor,
@@ -113,6 +103,10 @@ def nopad_context_forward(
context_lengths: torch.Tensor, # [num_seqs]
block_tables: torch.Tensor, # [num_seqs,max_blocks_per_sequence]
):
"""
NOTE: q, k, v are expected to be already projected and to have rotary embedding applied, so that this path is aligned with the Triton version.
"""
# First, do shape verification
num_tokens, num_heads, head_size = q.shape
block_size = k_cache.shape[-1]
bsz, max_blocks_per_sequence = block_tables.shape
@@ -122,42 +116,39 @@ def nopad_context_forward(
assert context_lengths.shape[0] == block_tables.shape[0]
shape = (bsz, max_seq_len, num_heads, head_size)
input_shape = shape[:2]
query = self.pad_and_reshape(q, context_lengths, max_seq_len, num_heads, head_size).transpose(1, 2)
key = self.pad_and_reshape(k, context_lengths, max_seq_len, num_heads, head_size).transpose(1, 2)
value = self.pad_and_reshape(v, context_lengths, max_seq_len, num_heads, head_size).transpose(1, 2)

attn_mask = AttentionMaskConverter._make_causal_mask(input_shape, q.dtype, q.device, past_key_values_length=0)
self.generate_padding_mask(context_lengths, max_seq_len)

position_ids = torch.arange(0, max_seq_len, dtype=torch.long, device=query.device)
position_ids = position_ids.unsqueeze(0)
q = PagedAttention.pad_and_reshape(q, context_lengths, max_seq_len, num_heads, head_size).transpose(1, 2)
k = PagedAttention.pad_and_reshape(k, context_lengths, max_seq_len, num_heads, head_size).transpose(1, 2)
v = PagedAttention.pad_and_reshape(v, context_lengths, max_seq_len, num_heads, head_size).transpose(1, 2)

cos, sin = self.rotary_emb(value, max_seq_len)
query, key = apply_rotary_pos_emb(query, key, cos, sin, position_ids)
copy_to_cache(k.transpose(1, 2), k_cache, lengths=context_lengths, block_tables=block_tables)
copy_to_cache(v.transpose(1, 2), v_cache, lengths=context_lengths, block_tables=block_tables)

copy_to_cache(key.transpose(1, 2), k_cache, lengths=context_lengths, block_tables=block_tables)
copy_to_cache(value.transpose(1, 2), v_cache, lengths=context_lengths, block_tables=block_tables)
attn_mask = AttentionMaskConverter._make_causal_mask(input_shape, q.dtype, q.device, past_key_values_length=0)
attn_mask += PagedAttention.generate_padding_mask(context_lengths, max_seq_len)

attn_weights = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(head_size)
# position_ids = torch.arange(0, max_seq_len, dtype=torch.long, device=query.device)
# position_ids = position_ids.unsqueeze(0)
# cos, sin = self.rotary_emb(value, max_seq_len)
# query, key = apply_rotary_pos_emb(query, key, cos, sin, position_ids)

attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_size)
if attn_weights.size() != (bsz, num_heads, max_seq_len, max_seq_len):
raise ValueError(f"Got wrong attn_weights, should be in shape {(bsz,num_heads,max_seq_len,max_seq_len)}.")

if attn_mask is not None:
attn_weights += attn_mask

attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
# attn_weights = nn.functional.dropout(attn_weights,p=self.attention_dropout,training=False) maybe useless
attn_output = torch.matmul(attn_weights, value)
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
attn_output = torch.matmul(attn_weights, v)

if attn_output.size() != (bsz, num_heads, max_seq_len, head_size):
raise ValueError(f"Got wrong attn_output, should be in shape {(bsz,num_heads,max_seq_len,head_size)}.")
attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, max_seq_len, -1)

return attn_output

@staticmethod
def pad_context_forward(
self,
q: torch.Tensor, # [batch_size, seq_len, num_heads, head_size]
k: torch.Tensor,
v: torch.Tensor,
@@ -166,36 +157,32 @@ def pad_context_forward(
context_lengths: torch.Tensor, # [num_seqs]
block_tables: torch.Tensor, # [num_seqs,max_blocks_per_sequence]
):
# First, do shape verification
bsz, seq_len, num_heads, head_size = q.shape
block_size = k_cache.shape[-1]
assert q.shape[0] == k.shape[0] == v.shape[0] == block_tables.shape[0]
block_tables.shape[-1] * block_size
shape = (bsz, seq_len, num_heads, head_size)
input_shape = shape[:2]

# Copy k/v (already rotary-embedded) into the cache
copy_to_cache(k, k_cache, lengths=context_lengths, block_tables=block_tables)
copy_to_cache(v, v_cache, lengths=context_lengths, block_tables=block_tables)

q = q.transpose(1, 2)
k = k.transpose(1, 2)
v = v.transpose(1, 2)

position_ids = torch.arange(0, seq_len, dtype=torch.long, device=q.device)
position_ids = position_ids.unsqueeze(0)
cos, sin = self.rotary_emb(v, seq_len)
query, key = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

copy_to_cache(key.transpose(1, 2), k_cache, lengths=context_lengths, block_tables=block_tables)
copy_to_cache(v.transpose(1, 2), v_cache, lengths=context_lengths, block_tables=block_tables)

attn_weights = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(head_size)
attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_size)
attn_mask = AttentionMaskConverter._make_causal_mask(input_shape, q.dtype, q.device, past_key_values_length=0)
self.generate_padding_mask(context_lengths, seq_len)
attn_mask += PagedAttention.generate_padding_mask(context_lengths, seq_len)

if attn_weights.size() != (bsz, num_heads, seq_len, seq_len):
raise ValueError(f"Got wrong attn_weights, should be in shape {(bsz,num_heads,seq_len,seq_len)}.")
if attn_mask is not None:
attn_weights += attn_mask

attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

# attn_weights = nn.functional.dropout(attn_weights,p=self.attention_dropout,training=False) maybe useless
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
attn_output = torch.matmul(attn_weights, v)

if attn_output.size() != (bsz, num_heads, seq_len, head_size):
@@ -205,8 +192,8 @@ def pad_context_forward(

return attn_output

@staticmethod
def pad_decoding_forward(
self,
q: torch.Tensor, # [bsz, 1, num_heads, head_size]
k: torch.Tensor,
v: torch.Tensor,
@@ -224,13 +211,12 @@ def pad_decoding_forward(
attn_mask = AttentionMaskConverter._make_causal_mask(
q.shape[:2], q.dtype, q.device, past_key_values_length=seq_len - 1
)
self.generate_padding_mask(lengths, max_seq_len)
cos, sin = self.rotary_emb(v, max_seq_len)

position_ids = lengths - 1
position_ids = position_ids.unsqueeze(1)
PagedAttention.generate_padding_mask(lengths, max_seq_len)

query, key = apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=2)
# cos, sin = self.rotary_emb(v, max_seq_len)
# position_ids = lengths - 1
# position_ids = position_ids.unsqueeze(1)
# query, key = apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=2)

copy_to_cache(key, k_cache, lengths=lengths, block_tables=block_tables, type="decoding")
copy_to_cache(v, v_cache, lengths=lengths, block_tables=block_tables, type="decoding")
@@ -250,7 +236,6 @@ def pad_decoding_forward(
attn_weights += attn_mask

attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
# attn_weights = nn.functional.dropout(attn_weights,p=self.attention_dropout,training=False) maybe useless
attn_output = torch.matmul(attn_weights, value)

if attn_output.size() != (bsz, num_heads, 1, head_size):
@@ -259,6 +244,7 @@ def pad_decoding_forward(

return attn_output

@staticmethod
def no_pad_decoding_forward(
self,
q: torch.Tensor, # [num_tokens, num_heads, head_size]