################################################################################
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
################################################################################
import logging
import sys

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, TableDescriptor, Schema, StreamTableEnvironment)
from pyflink.table.expressions import col
from pyflink.table.udf import udf


def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing the
    # Table API and the DataStream API
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
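    # the table environment wraps the same StreamExecutionEnvironment, so the
    # Table API and DataStream API operations below compile into a single job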

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .option("number-of-rows", "10")
                       .build())
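    # the datagen connector produces random rows; the "number-of-rows" option
    # bounds the source at 10 rows, so the example job finishes on its own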

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('a', DataTypes.BIGINT())
                               .build())
                       .build())
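    # the print connector writes every incoming row to stdout

    # a Python scalar function, defined inline via the @udf decorator and
    # applied per row in the select() below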
    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())
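    # to_data_stream() yields a DataStream of Row objects whose fields are
    # addressable by position: i[0] is 'id', i[1] is the computed length;
    # Types.LONG() keeps the result a 64-bit integer (BIGINT on the table side)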

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())
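    # the stream carries a single atomic LONG value, which Flink exposes under
    # the default field name "f0"; the explicit schema maps it to BIGINT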

    # execute
    table.execute_insert('sink') \
         .wait()
    # remove .wait if submitting to a remote cluster, refer to
    # https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/python/faq/#wait-for-jobs-to-finish-when-executing-jobs-in-mini-cluster
    # for more details


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
    mixing_use_of_datastream_and_table()