Sérialisation avec protobuf#

protobuf optimise la sérialisation de deux façons. Elle accélère l’écriture et la lecture des données et permet aussi un accès rapide à une information précise sans désérialiser les autres. Elle réalise cela en imposant un schéma strict de données.

L’exemple fonctionne si l’exécutable protoc et le package protobuf ont des versions compatibles. Un message apparaîtra dans le cas contraire.

protoc --version
python -c "import google.protobuf as gp;print(gp.__version__)"

Schéma#

On récupère l’exemple du tutoriel.

import os
import sys
import timeit
import struct
from io import BytesIO
from sphinx_runpython.runpython import run_cmd
import google.protobuf as gp
from google.protobuf.json_format import MessageToJson, Parse as ParseJson

# Text of the .proto schema used throughout this example: proto2 syntax,
# package ``tutorial``, a ``Person`` message (with the nested ``PhoneType``
# enum and ``PhoneNumber`` message) and an ``AddressBook`` message holding
# a repeated list of ``Person``.
schema = """
syntax = "proto2";

package tutorial;

message Person {
  optional string name = 1;
  optional int32 id = 2;
  optional string email = 3;

  enum PhoneType {
    MOBILE = 0;
    HOME = 1;
    WORK = 2;
  }

  message PhoneNumber {
    optional string number = 1;
    optional PhoneType type = 2 [default = HOME];
  }

  repeated PhoneNumber phones = 4;
}

message AddressBook {
  repeated Person people = 1;
}
"""

Compilation#

Il faut d’abord récupérer le compilateur. Cela peut se faire depuis le site de protobuf ou, sur Linux (Ubuntu/Debian), avec la commande apt-get install protobuf-compiler, afin d’obtenir le programme protoc.

'4.23.4'
# Write the schema to disk so that protoc can read it. The encoding is
# explicit so the file content does not depend on the locale's default.
with open("schema.proto", "w", encoding="utf-8") as f:
    f.write(schema)

# Compile the schema: --python_out=. asks protoc to emit the generated
# Python module in the current directory.
cmd = "protoc --python_out=. schema.proto"
out, err = run_cmd(cmd=cmd, wait=True)
print(out)
print(err)

Un fichier a été généré.

# Keep only the Python source files of the current directory. A suffix test
# is used because a substring test would also match ".pyc", ".py.bak", ...
[name for name in os.listdir(".") if name.endswith(".py")]
['plot_lambda_function.py', 'plot_pandas_groupby.py', 'plot_tarabiscote.py', 'plot_serialisation_examples.py', 'plot_partie_dame.py', 'plot_serialisation_protobuf.py', 'plot_gil_example.py', 'plot_numpy_tricks.py', 'plot_float_and_double_rouding.py', 'plot_hypercube.py', 'schema_pb2.py']
# The generated module starts with a UTF-8 coding declaration, so decode it
# as UTF-8 instead of relying on the locale's default encoding.
with open("schema_pb2.py", "r", encoding="utf-8") as f:
    content = f.read()
# Only display the beginning of the generated code.
print(content[:1000])
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler.  DO NOT EDIT!
# source: schema.proto
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()




DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cschema.proto\x12\x08tutorial\"\xdb\x01\n\x06Person\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\x05\x12\r\n\x05\x65mail\x18\x03 \x01(\t\x12,\n\x06phones\x18\x04 \x03(\x0b\x32\x1c.tutorial.Person.PhoneNumber\x1aM\n\x0bPhoneNumber\x12\x0e\n\x06number\x18\x01 \x01(\t\x12.\n\x04type\x18\x02 \x01(\x0e\x32\x1a.tutorial.Person.PhoneType:\x04HOME\"+\n\tPhoneType\x12\n\n\x06MOBILE\x10\x00\x12\x08\n\x04HOME\x10\x01\x12\x08\n\x04WORK\x10\x02\"/\n\x0b\x41\x64\x64r

Import du module créé#

Pour utiliser protobuf, il faut importer le module créé.

sys.path.append(".")
import schema_pb2  # noqa: E402

On crée un enregistrement.

# Build one Person record; scalar fields are given as keyword arguments.
person = schema_pb2.Person(
    id=1234,
    name="John Doe",
    email="jdoe@example.com",
)
# add() appends a new PhoneNumber to the repeated field and returns it.
phone = person.phones.add(
    number="555-4321",
    type=schema_pb2.Person.HOME,
)
person
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}

Sérialisation en chaîne de caractères#

res = person.SerializeToString()
type(res), res
(<class 'bytes'>, b'\n\x08John Doe\x10\xd2\t\x1a\x10jdoe@example.com"\x0c\n\x08555-4321\x10\x01')
timeit.timeit("person.SerializeToString()", globals=globals(), number=100)
0.0001368000002912595
pers = schema_pb2.Person.FromString(res)
pers
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
# Alternative to Person.FromString: create an empty message first, then
# fill it in place from the serialized bytes held in `res`.
pers = schema_pb2.Person()
pers.ParseFromString(res)
pers
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
timeit.timeit("schema_pb2.Person.FromString(res)", globals=globals(), number=100)
0.00017549999938637484
timeit.timeit("pers.ParseFromString(res)", globals=globals(), number=100)
9.479999971517827e-05

Plusieurs chaînes de caractères#

# First record of the small in-memory database.
person = schema_pb2.Person(
    id=1234,
    name="John Doe",
    email="jdoe@example.com",
)
phone = person.phones.add(
    number="555-4321",
    type=schema_pb2.Person.HOME,
)
db = [person]

# Second record, with a mobile phone number.
person = schema_pb2.Person(
    id=5678,
    name="Johnette Doette",
    email="jtdoet@example2.com",
)
phone = person.phones.add(
    number="777-1234",
    type=schema_pb2.Person.MOBILE,
)
db.append(person)
# Serialized messages carry no end marker, so each record is written as a
# length prefix (struct format "i", native int) followed by its bytes.
buffer = BytesIO()
for p in db:
    # Serialize once and derive the prefix from the actual payload, so the
    # announced size always matches the bytes that follow it.
    payload = p.SerializeToString()
    size = len(payload)
    buffer.write(struct.pack("i", size))
    buffer.write(payload)
res = buffer.getvalue()
res
b'-\x00\x00\x00\n\x08John Doe\x10\xd2\t\x1a\x10jdoe@example.com"\x0c\n\x08555-4321\x10\x017\x00\x00\x00\n\x0fJohnette Doette\x10\xae,\x1a\x13jtdoet@example2.com"\x0c\n\x08777-1234\x10\x00'
# Read back the records stored in `res`: each one is a length prefix
# (struct format "i") followed by that many bytes of serialized Person.
db2 = []
buffer = BytesIO(res)
# Width of the prefix, derived from the format rather than hard-coded.
header_size = struct.calcsize("i")
while True:
    bsize = buffer.read(header_size)
    if not bsize:
        # Nothing left to read: done.
        break
    size = struct.unpack("i", bsize)[0]
    data = buffer.read(size)
    if len(data) != size:
        # A short read means the stream was cut in the middle of a record.
        raise ValueError(
            f"Truncated record: expected {size} bytes, got {len(data)}."
        )
    p = schema_pb2.Person.FromString(data)
    db2.append(p)
db2[0], db2[1]
(name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
, name: "Johnette Doette"
id: 5678
email: "jtdoet@example2.com"
phones {
  number: "777-1234"
  type: MOBILE
}
)

Sérialisation JSON#

print(MessageToJson(pers))
{
  "name": "John Doe",
  "id": 1234,
  "email": "jdoe@example.com",
  "phones": [
    {
      "number": "555-4321",
      "type": "HOME"
    }
  ]
}
timeit.timeit("MessageToJson(pers)", globals=globals(), number=100)
0.015570000000479922
# Round trip through JSON: message -> JSON text -> new Person message.
js = MessageToJson(pers)
# NOTE: `res` is rebound here; it now holds the parsed message returned by
# ParseJson, no longer the serialized bytes assigned earlier.
res = ParseJson(js, message=schema_pb2.Person())
res
name: "John Doe"
id: 1234
email: "jdoe@example.com"
phones {
  number: "555-4321"
  type: HOME
}
timeit.timeit(
    "ParseJson(js, message=schema_pb2.Person())", globals=globals(), number=100
)
0.012188699999569508

Total running time of the script: ( 0 minutes 0.501 seconds)

Gallery generated by Sphinx-Gallery