tumbling_count_window.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. ################################################################################
  2. # Licensed to the Apache Software Foundation (ASF) under one
  3. # or more contributor license agreements. See the NOTICE file
  4. # distributed with this work for additional information
  5. # regarding copyright ownership. The ASF licenses this file
  6. # to you under the Apache License, Version 2.0 (the
  7. # "License"); you may not use this file except in compliance
  8. # with the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. ################################################################################
  18. import sys
  19. import argparse
  20. from typing import Iterable
  21. from pyflink.datastream.connectors.file_system import FileSink, OutputFileConfig, RollingPolicy
  22. from pyflink.common import Types, Encoder
  23. from pyflink.datastream import StreamExecutionEnvironment, WindowFunction
  24. from pyflink.datastream.window import CountWindow
  25. class SumWindowFunction(WindowFunction[tuple, tuple, str, CountWindow]):
  26. def apply(self, key: str, window: CountWindow, inputs: Iterable[tuple]):
  27. result = 0
  28. for i in inputs:
  29. result += i[0]
  30. return [(key, result)]
  31. if __name__ == '__main__':
  32. parser = argparse.ArgumentParser()
  33. parser.add_argument(
  34. '--output',
  35. dest='output',
  36. required=False,
  37. help='Output file to write results to.')
  38. argv = sys.argv[1:]
  39. known_args, _ = parser.parse_known_args(argv)
  40. output_path = known_args.output
  41. env = StreamExecutionEnvironment.get_execution_environment()
  42. # write all the data to one file
  43. env.set_parallelism(1)
  44. # define the source
  45. data_stream = env.from_collection([
  46. (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello'), (6, 'hello')],
  47. type_info=Types.TUPLE([Types.INT(), Types.STRING()]))
  48. ds = data_stream.key_by(lambda x: x[1], key_type=Types.STRING()) \
  49. .count_window(2) \
  50. .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))
  51. # define the sink
  52. if output_path is not None:
  53. ds.sink_to(
  54. sink=FileSink.for_row_format(
  55. base_path=output_path,
  56. encoder=Encoder.simple_string_encoder())
  57. .with_output_file_config(
  58. OutputFileConfig.builder()
  59. .with_part_prefix("prefix")
  60. .with_part_suffix(".ext")
  61. .build())
  62. .with_rolling_policy(RollingPolicy.default_rolling_policy())
  63. .build()
  64. )
  65. else:
  66. print("Printing result to stdout. Use --output to specify output path.")
  67. ds.print()
  68. # submit for execution
  69. env.execute()