session_with_dynamic_gap_window.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. ################################################################################
  2. # Licensed to the Apache Software Foundation (ASF) under one
  3. # or more contributor license agreements. See the NOTICE file
  4. # distributed with this work for additional information
  5. # regarding copyright ownership. The ASF licenses this file
  6. # to you under the Apache License, Version 2.0 (the
  7. # "License"); you may not use this file except in compliance
  8. # with the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. ################################################################################
  18. import sys
  19. import argparse
  20. from typing import Iterable
  21. from pyflink.datastream.connectors.file_system import FileSink, OutputFileConfig, RollingPolicy
  22. from pyflink.common import Types, WatermarkStrategy, Encoder
  23. from pyflink.common.watermark_strategy import TimestampAssigner
  24. from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction
  25. from pyflink.datastream.window import EventTimeSessionWindows, \
  26. SessionWindowTimeGapExtractor, TimeWindow
  27. class MyTimestampAssigner(TimestampAssigner):
  28. def extract_timestamp(self, value, record_timestamp) -> int:
  29. return int(value[1])
  30. class MySessionWindowTimeGapExtractor(SessionWindowTimeGapExtractor):
  31. def extract(self, element: tuple) -> int:
  32. return element[1]
  33. class CountWindowProcessFunction(ProcessWindowFunction[tuple, tuple, str, TimeWindow]):
  34. def process(self,
  35. key: str,
  36. context: ProcessWindowFunction.Context[TimeWindow],
  37. elements: Iterable[tuple]) -> Iterable[tuple]:
  38. return [(key, context.window().start, context.window().end, len([e for e in elements]))]
  39. if __name__ == '__main__':
  40. parser = argparse.ArgumentParser()
  41. parser.add_argument(
  42. '--output',
  43. dest='output',
  44. required=False,
  45. help='Output file to write results to.')
  46. argv = sys.argv[1:]
  47. known_args, _ = parser.parse_known_args(argv)
  48. output_path = known_args.output
  49. env = StreamExecutionEnvironment.get_execution_environment()
  50. # write all the data to one file
  51. env.set_parallelism(1)
  52. # define the source
  53. data_stream = env.from_collection([
  54. ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)],
  55. type_info=Types.TUPLE([Types.STRING(), Types.INT()]))
  56. # define the watermark strategy
  57. watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
  58. .with_timestamp_assigner(MyTimestampAssigner())
  59. ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
  60. .key_by(lambda x: x[0], key_type=Types.STRING()) \
  61. .window(EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor())) \
  62. .process(CountWindowProcessFunction(),
  63. Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))
  64. # define the sink
  65. if output_path is not None:
  66. ds.sink_to(
  67. sink=FileSink.for_row_format(
  68. base_path=output_path,
  69. encoder=Encoder.simple_string_encoder())
  70. .with_output_file_config(
  71. OutputFileConfig.builder()
  72. .with_part_prefix("prefix")
  73. .with_part_suffix(".ext")
  74. .build())
  75. .with_rolling_policy(RollingPolicy.default_rolling_policy())
  76. .build()
  77. )
  78. else:
  79. print("Printing result to stdout. Use --output to specify output path.")
  80. ds.print()
  81. # submit for execution
  82. env.execute()