Skip to content

Commit 83fad15

Browse files
authored
feat: Entity key deserialization (feast-dev#4284)
* Add new version of serialization and deserialization Signed-off-by: cmuhao <sduxuhao@gmail.com> * Add new version of serialization and deserialization Signed-off-by: cmuhao <sduxuhao@gmail.com> * fix test Signed-off-by: cmuhao <sduxuhao@gmail.com> * fix test Signed-off-by: cmuhao <sduxuhao@gmail.com> * add test Signed-off-by: cmuhao <sduxuhao@gmail.com> * add test Signed-off-by: cmuhao <sduxuhao@gmail.com> * update doc Signed-off-by: cmuhao <sduxuhao@gmail.com> --------- Signed-off-by: cmuhao <sduxuhao@gmail.com>
1 parent df46cae commit 83fad15

File tree

3 files changed

+154
-5
lines changed

3 files changed

+154
-5
lines changed

sdk/python/feast/infra/key_encoding_utils.py

+79-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,23 @@ def _serialize_val(
2020
return struct.pack("<l", v.int64_val), ValueType.INT64
2121
return struct.pack("<q", v.int64_val), ValueType.INT64
2222
else:
23-
raise ValueError(f"Value type not supported for Firestore: {v}")
23+
raise ValueError(f"Value type not supported for feast feature store: {v}")
24+
25+
26+
def _deserialize_value(value_type, value_bytes) -> ValueProto:
    """
    Rebuild a ValueProto from a value type tag and the raw payload bytes.

    Args:
        value_type: ValueType tag describing how value_bytes is encoded
        value_bytes: raw payload bytes of the value

    Returns: ValueProto with the field that matches value_type populated

    Raises:
        ValueError: if value_type is not one of INT64, INT32, STRING or BYTES
    """
    if value_type == ValueType.INT64:
        # "<q" is a little-endian signed 8-byte integer.
        (number,) = struct.unpack("<q", value_bytes)
        return ValueProto(int64_val=number)

    if value_type == ValueType.INT32:
        # "<i" is a little-endian signed 4-byte integer.
        (number,) = struct.unpack("<i", value_bytes)
        return ValueProto(int32_val=number)

    if value_type == ValueType.STRING:
        return ValueProto(string_val=value_bytes.decode("utf-8"))

    if value_type == ValueType.BYTES:
        return ValueProto(bytes_val=value_bytes)

    raise ValueError(f"Unsupported value type: {value_type}")
2440

2541

2642
def serialize_entity_key_prefix(entity_keys: List[str]) -> bytes:
@@ -50,6 +66,15 @@ def serialize_entity_key(
5066
serialize to the same byte string[1].
5167
5268
[1] https://developers.google.com/protocol-buffers/docs/encoding
69+
70+
Args:
71+
entity_key_serialization_version: version of the entity key serialization
72+
version 1: int64 values are serialized as 4 bytes
73+
version 2: int64 values are serialized as 8 bytes
74+
version 3: entity_key size is added to the serialization for deserialization purposes
75+
entity_key: EntityKeyProto
76+
77+
Returns: bytes of the serialized entity key
5378
"""
5479
sorted_keys, sorted_values = zip(
5580
*sorted(zip(entity_key.join_keys, entity_key.entity_values))
@@ -58,6 +83,8 @@ def serialize_entity_key(
5883
output: List[bytes] = []
5984
for k in sorted_keys:
6085
output.append(struct.pack("<I", ValueType.STRING))
86+
if entity_key_serialization_version > 2:
87+
output.append(struct.pack("<I", len(k)))
6188
output.append(k.encode("utf8"))
6289
for v in sorted_values:
6390
val_bytes, value_type = _serialize_val(
@@ -74,6 +101,57 @@ def serialize_entity_key(
74101
return b"".join(output)
75102

76103

104+
def deserialize_entity_key(
    serialized_entity_key: bytes, entity_key_serialization_version=3
) -> EntityKeyProto:
    """
    Deserialize entity key from a bytestring. This function can only be used with
    entity_key_serialization_version > 2.

    With version > 2 every join key and every entity value is stored as one record:
    a little-endian uint32 type tag, a little-endian uint32 payload length, then the
    payload itself. serialize_entity_key writes all join-key records first and all
    value records after them, so the first half of the decoded records are the join
    keys and the second half are the values, in matching order.

    Args:
        serialized_entity_key: serialized entity key bytes
        entity_key_serialization_version: version of the entity key serialization

    Returns: EntityKeyProto

    Raises:
        ValueError: if the version is <= 2, a record's payload runs past the end of
            the buffer, the records cannot be split evenly into keys and values,
            a join key is not tagged as STRING, or a value has an unsupported type
        struct.error: if the buffer ends in the middle of a record header
    """
    if entity_key_serialization_version <= 2:
        raise ValueError(
            "Deserialization of entity key with version <= 2 is not supported. Please use version > 2 by setting entity_key_serialization_version=3"
        )

    header_format = "<II"  # type tag followed by payload length
    header_size = struct.calcsize(header_format)
    total_size = len(serialized_entity_key)

    # Pass 1: split the buffer into (type tag, payload) records.
    records = []
    offset = 0
    while offset < total_size:
        type_tag, payload_length = struct.unpack_from(
            header_format, serialized_entity_key, offset
        )
        offset += header_size

        payload_end = offset + payload_length
        if payload_end > total_size:
            # Slicing would silently hand back a short payload; fail loudly instead.
            raise ValueError(
                f"Serialized entity key is truncated: expected {payload_length} "
                f"payload bytes at offset {offset}, found {total_size - offset}"
            )
        records.append((type_tag, serialized_entity_key[offset:payload_end]))
        offset = payload_end

    # Pass 2: keys and values are written as two consecutive runs of equal length.
    if len(records) % 2 != 0:
        raise ValueError(
            f"Serialized entity key holds {len(records)} records, which cannot be "
            "split into matching join keys and entity values"
        )
    key_count = len(records) // 2

    keys = []
    for key_type, key_bytes in records[:key_count]:
        if key_type != ValueType.STRING:
            raise ValueError(f"Unsupported key type: {key_type}")
        # The length prefix is exact, so the payload is decoded without stripping.
        keys.append(key_bytes.decode("utf-8"))

    values = [
        _deserialize_value(value_type, value_bytes)
        for value_type, value_bytes in records[key_count:]
    ]

    return EntityKeyProto(join_keys=keys, entity_values=values)
153+
154+
77155
def get_list_val_str(val):
78156
accept_value_types = [
79157
"float_list_val",

sdk/python/feast/repo_config.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -175,10 +175,12 @@ class RepoConfig(FeastBaseModel):
175175
used when writing data to the online store.
176176
A value <= 1 uses the serialization scheme used by feast up to Feast 0.22.
177177
A value of 2 uses a newer serialization scheme, supported as of Feast 0.23.
178-
The main difference between the two scheme is that the serialization scheme v1 stored `long` values as `int`s,
179-
which would result in errors trying to serialize a range of values.
180-
v2 fixes this error, but v1 is kept around to ensure backwards compatibility - specifically the ability to read
178+
A value of 3 uses the latest serialization scheme, supported as of Feast 0.38.
179+
The main differences between the three schemes are:
180+
v1: the serialization scheme v1 stored `long` values as `int`s, which would result in errors trying to serialize a range of values.
181+
v2: fixes this error, but v1 is kept around to ensure backwards compatibility - specifically the ability to read
181182
feature values for entities that have already been written into the online store.
183+
v3: adds the entity_key value length to the serialized bytes to enable deserialization, which can be used to recover the entity_key in document retrieval.
182184
"""
183185

184186
coerce_tz_aware: Optional[bool] = True

sdk/python/tests/unit/infra/test_key_encoding_utils.py

+70-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
import pytest
22

3-
from feast.infra.key_encoding_utils import serialize_entity_key
3+
from feast.infra.key_encoding_utils import (
4+
_deserialize_value,
5+
_serialize_val,
6+
deserialize_entity_key,
7+
serialize_entity_key,
8+
)
49
from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto
510
from feast.protos.feast.types.Value_pb2 import Value as ValueProto
11+
from feast.protos.feast.types.Value_pb2 import ValueType
612

713

814
def test_serialize_entity_key():
@@ -28,3 +34,66 @@ def test_serialize_entity_key():
2834
join_keys=["user"], entity_values=[ValueProto(int64_val=int(2**31))]
2935
),
3036
)
37+
38+
39+
def test_deserialize_entity_key():
    """A single-key entity key serialized with version 3 round-trips unchanged."""
    entity_key = EntityKeyProto(
        join_keys=["user"], entity_values=[ValueProto(int64_val=int(2**15))]
    )

    encoded = serialize_entity_key(entity_key, entity_key_serialization_version=3)
    decoded = deserialize_entity_key(encoded, entity_key_serialization_version=3)

    assert decoded == entity_key
53+
54+
55+
def test_serialize_value():
    """Check the payload bytes and type tag _serialize_val returns per value kind."""
    payload, kind = _serialize_val("string_val", ValueProto(string_val="test"))
    assert kind == ValueType.STRING
    assert payload == b"test"

    payload, kind = _serialize_val("bytes_val", ValueProto(bytes_val=b"test"))
    assert kind == ValueType.BYTES
    assert payload == b"test"

    payload, kind = _serialize_val("int32_val", ValueProto(int32_val=1))
    assert kind == ValueType.INT32
    assert payload == b"\x01\x00\x00\x00"

    # No version passed: the default (version 1) int64 encoding is 4 bytes wide.
    payload, kind = _serialize_val("int64_val", ValueProto(int64_val=1))
    assert kind == ValueType.INT64
    assert payload == b"\x01\x00\x00\x00"

    # Version 2 widens int64 to 8 bytes; version 3 must encode the value identically.
    for version in (2, 3):
        payload, kind = _serialize_val(
            "int64_val",
            ValueProto(int64_val=1),
            entity_key_serialization_version=version,
        )
        assert kind == ValueType.INT64
        assert payload == b"\x01\x00\x00\x00\x00\x00\x00\x00"
86+
87+
88+
def test_deserialize_value():
    """Each supported type tag decodes its payload into the matching ValueProto field."""
    assert _deserialize_value(ValueType.STRING, b"test").string_val == "test"

    assert _deserialize_value(ValueType.BYTES, b"test").bytes_val == b"test"

    # Integers are little-endian: 4 bytes for INT32, 8 bytes for INT64.
    assert _deserialize_value(ValueType.INT32, b"\x01\x00\x00\x00").int32_val == 1

    int64_payload = b"\x01\x00\x00\x00\x00\x00\x00\x00"
    assert _deserialize_value(ValueType.INT64, int64_payload).int64_val == 1

0 commit comments

Comments
 (0)