diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 2263277b68..355a8d8466 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -420,7 +420,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 51ac654aac..dc70d25c2c 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,9 +1,9 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include #include #include +#include namespace { @@ -58,7 +58,7 @@ struct TextStreamer { void end() { std::string text = detokenize(detokenizer, token_cache); if (text.size() <= print_len) - return ; + return; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -75,10 +75,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; + size_t old_seq_len = shape[seq_len_axis]; OPENVINO_ASSERT(new_seq_len <= old_seq_len); @@ -86,14 +83,16 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (old_seq_len == new_seq_len) return tensor; + shape[seq_len_axis] = new_seq_len; + if (seq_len_axis == 0) { - shape[0] = new_seq_len; tensor.set_shape(shape); return tensor; } ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + ov::Coordinate new_shape_end{shape}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); return new_tensor; diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 4927b7d795..ba610574e8 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -6,13 +6,13 @@ #include #include -constexpr size_t BATCH_SIZE = 1; +namespace { +constexpr size_t BATCH_SIZE = 1; // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually SEQ_LEN_AXIS = 2 constexpr size_t SEQ_LEN_AXIS = 2; -namespace { std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); tokenizer.infer(); @@ -58,7 +58,7 @@ struct TextStreamer { void end() { std::string text = detokenize(detokenizer, token_cache); if (text.size() <= print_len) - return ; + return; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -75,10 +75,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; + size_t old_seq_len = shape[seq_len_axis]; OPENVINO_ASSERT(new_seq_len <= old_seq_len); @@ -86,14 +83,16 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (old_seq_len == new_seq_len) return tensor; + shape[seq_len_axis] = new_seq_len; + if (seq_len_axis == 0) { - shape[0] = new_seq_len; tensor.set_shape(shape); return tensor; } ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + ov::Coordinate new_shape_end{shape}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); return new_tensor;