tumbling_time_window.py 3.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. ################################################################################
  2. # Licensed to the Apache Software Foundation (ASF) under one
  3. # or more contributor license agreements. See the NOTICE file
  4. # distributed with this work for additional information
  5. # regarding copyright ownership. The ASF licenses this file
  6. # to you under the Apache License, Version 2.0 (the
  7. # "License"); you may not use this file except in compliance
  8. # with the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. ################################################################################
  18. import sys
  19. import argparse
  20. from typing import Iterable
  21. from pyflink.datastream.connectors.file_system import FileSink, OutputFileConfig, RollingPolicy
  22. from pyflink.common import Types, WatermarkStrategy, Time, Encoder
  23. from pyflink.common.watermark_strategy import TimestampAssigner
  24. from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction
  25. from pyflink.datastream.window import TumblingEventTimeWindows, TimeWindow
  26. class MyTimestampAssigner(TimestampAssigner):
  27. def extract_timestamp(self, value, record_timestamp) -> int:
  28. return int(value[1])
  29. class CountWindowProcessFunction(ProcessWindowFunction[tuple, tuple, str, TimeWindow]):
  30. def process(self,
  31. key: str,
  32. context: ProcessWindowFunction.Context[TimeWindow],
  33. elements: Iterable[tuple]) -> Iterable[tuple]:
  34. return [(key, context.window().start, context.window().end, len([e for e in elements]))]
  35. if __name__ == '__main__':
  36. parser = argparse.ArgumentParser()
  37. parser.add_argument(
  38. '--output',
  39. dest='output',
  40. required=False,
  41. help='Output file to write results to.')
  42. argv = sys.argv[1:]
  43. known_args, _ = parser.parse_known_args(argv)
  44. output_path = known_args.output
  45. env = StreamExecutionEnvironment.get_execution_environment()
  46. # write all the data to one file
  47. env.set_parallelism(1)
  48. # define the source
  49. data_stream = env.from_collection([
  50. ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)],
  51. type_info=Types.TUPLE([Types.STRING(), Types.INT()]))
  52. # define the watermark strategy
  53. watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
  54. .with_timestamp_assigner(MyTimestampAssigner())
  55. ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
  56. .key_by(lambda x: x[0], key_type=Types.STRING()) \
  57. .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
  58. .process(CountWindowProcessFunction(),
  59. Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))
  60. # define the sink
  61. if output_path is not None:
  62. ds.sink_to(
  63. sink=FileSink.for_row_format(
  64. base_path=output_path,
  65. encoder=Encoder.simple_string_encoder())
  66. .with_output_file_config(
  67. OutputFileConfig.builder()
  68. .with_part_prefix("prefix")
  69. .with_part_suffix(".ext")
  70. .build())
  71. .with_rolling_policy(RollingPolicy.default_rolling_policy())
  72. .build()
  73. )
  74. else:
  75. print("Printing result to stdout. Use --output to specify output path.")
  76. ds.print()
  77. # submit for execution
  78. env.execute()