# streaming_word_count.py
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import argparse
import logging
import sys

from pyflink.common import Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import (FileSink, OutputFileConfig, RollingPolicy)
from pyflink.table import DataTypes, Schema, StreamTableEnvironment, TableDescriptor
  25. words = ["flink", "window", "timer", "event_time", "processing_time", "state",
  26. "connector", "pyflink", "checkpoint", "watermark", "sideoutput", "sql",
  27. "datastream", "broadcast", "asyncio", "catalog", "batch", "streaming"]
  28. max_word_id = len(words) - 1
  29. def word_count(output_path):
  30. env = StreamExecutionEnvironment.get_execution_environment()
  31. t_env = StreamTableEnvironment.create(stream_execution_environment=env)
  32. # define the source
  33. # randomly select 5 words per second from a predefined list
  34. t_env.create_temporary_table(
  35. 'source',
  36. TableDescriptor.for_connector('datagen')
  37. .schema(Schema.new_builder()
  38. .column('word_id', DataTypes.INT())
  39. .build())
  40. .option('fields.word_id.kind', 'random')
  41. .option('fields.word_id.min', '0')
  42. .option('fields.word_id.max', str(max_word_id))
  43. .option('rows-per-second', '5')
  44. .build())
  45. table = t_env.from_path('source')
  46. ds = t_env.to_data_stream(table)
  47. def id_to_word(r):
  48. # word_id is the first column of the input row
  49. return words[r[0]]
  50. # compute word count
  51. ds = ds.map(id_to_word) \
  52. .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
  53. .key_by(lambda i: i[0]) \
  54. .reduce(lambda i, j: (i[0], i[1] + j[1]))
  55. # define the sink
  56. if output_path is not None:
  57. ds.sink_to(
  58. sink=FileSink.for_row_format(
  59. base_path=output_path,
  60. encoder=Encoder.simple_string_encoder())
  61. .with_output_file_config(
  62. OutputFileConfig.builder()
  63. .with_part_prefix("prefix")
  64. .with_part_suffix(".ext")
  65. .build())
  66. .with_rolling_policy(RollingPolicy.default_rolling_policy())
  67. .build()
  68. )
  69. else:
  70. print("Printing result to stdout. Use --output to specify output path.")
  71. ds.print()
  72. # submit for execution
  73. env.execute()
  74. if __name__ == '__main__':
  75. logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
  76. parser = argparse.ArgumentParser()
  77. parser.add_argument(
  78. '--output',
  79. dest='output',
  80. required=False,
  81. help='Output file to write results to.')
  82. argv = sys.argv[1:]
  83. known_args, _ = parser.parse_known_args(argv)
  84. word_count(known_args.output)