kafka_json_format.py

################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import logging
import sys

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import FlinkKafkaProducer, FlinkKafkaConsumer
from pyflink.datastream.formats.json import JsonRowSerializationSchema, JsonRowDeserializationSchema


# Make sure that the Kafka cluster is started and the topic 'test_json_topic' is
# created before executing this job.
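#
# A minimal sketch for creating the topic programmatically; it assumes the
# third-party kafka-python package (pip install kafka-python), which the job
# itself does not need:
#
#     from kafka.admin import KafkaAdminClient, NewTopic
#     admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
#     admin.create_topics([NewTopic(name='test_json_topic',
#                                   num_partitions=1,
#                                   replication_factor=1)])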
def write_to_kafka(env):
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    ds = env.from_collection(
        [(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello'), (6, 'hello')],
        type_info=type_info)

    serialization_schema = JsonRowSerializationSchema.Builder() \
        .with_type_info(type_info) \
        .build()
    kafka_producer = FlinkKafkaProducer(
        topic='test_json_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    )

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
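

# The records written above land in Kafka as JSON objects keyed by the
# positional field names of the row type, e.g. {"f0": 1, "f1": "hi"};
# read_from_kafka parses them back into Rows with the matching schema.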
def read_from_kafka(env):
    deserialization_schema = JsonRowDeserializationSchema.Builder() \
        .type_info(Types.ROW([Types.INT(), Types.STRING()])) \
        .build()
    kafka_consumer = FlinkKafkaConsumer(
        topics='test_json_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'}
    )
    # read the topic from the beginning so the records just written are picked up
    kafka_consumer.set_start_from_earliest()

    env.add_source(kafka_consumer).print()
    env.execute()


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    env = StreamExecutionEnvironment.get_execution_environment()
    # the Kafka connector jar must be on the job's classpath; point this at a
    # local copy of flink-sql-connector-kafka matching your Flink version
    env.add_jars("file:///path/to/flink-sql-connector-kafka-1.15.0.jar")

    print("start writing data to kafka")
    write_to_kafka(env)

    print("start reading data from kafka")
    read_from_kafka(env)
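
# To run this example (a sketch, assuming a local PyFlink 1.15 installation and
# the connector jar referenced above):
#
#     python kafka_json_format.py
#
# or submit it to a running cluster with the Flink CLI:
#
#     flink run -py kafka_json_format.py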