-
Notifications
You must be signed in to change notification settings - Fork 20
feat: 新增流程创建任务并发控制 --story=128208486 #469
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |
| from bkflow.contrib.api.collections.interface import InterfaceModuleClient | ||
| from bkflow.exceptions import ValidationError | ||
| from bkflow.pipeline_plugins.components.collections.base import BKFlowBaseService | ||
| from bkflow.task.utils import push_task_to_queue | ||
|
|
||
|
|
||
| class Subprocess(BaseModel): | ||
|
|
@@ -138,7 +139,7 @@ def _create_subprocess_task_instance(self, subprocess, template, pipeline_tree, | |
| from bkflow.task.utils import extract_extra_info | ||
|
|
||
| with transaction.atomic(): | ||
| time_zone = timezone.pytz.timezone(settings.TIME_ZONE) or "Asia/Shanghai" | ||
| time_zone = timezone.pytz.timezone(settings.TIME_ZONE) | ||
| time_stamp = datetime.datetime.now(tz=time_zone).strftime("%Y%m%d%H%M%S") | ||
| create_task_data = { | ||
| "name": f"{subprocess.subprocess_name}_子流程_{time_stamp}", | ||
|
|
@@ -165,7 +166,7 @@ def _create_subprocess_task_instance(self, subprocess, template, pipeline_tree, | |
| except TaskFlowRelation.DoesNotExist: | ||
| root_task_id = parent_task.id | ||
|
|
||
| relate_info = {"node_id": self.id, "node_version": self.version} | ||
| relate_info = {"node_id": self.id, "node_version": self.version, "parent_task_id": parent_task.id} | ||
| TaskFlowRelation.objects.create( | ||
| task_id=task_instance.id, | ||
| parent_task_id=parent_task.id, | ||
|
|
@@ -189,6 +190,7 @@ def _create_subprocess_task_instance(self, subprocess, template, pipeline_tree, | |
| def plugin_execute(self, data, parent_data): | ||
| from bkflow.task.models import TaskInstance | ||
| from bkflow.task.operations import TaskOperation | ||
| from bkflow.task.utils import count_running_tasks | ||
|
|
||
| parent_task_id = parent_data.get_one_of_inputs("task_id") | ||
| try: | ||
|
|
@@ -210,6 +212,16 @@ def plugin_execute(self, data, parent_data): | |
|
|
||
| # 设置输出并启动任务 | ||
| data.set_outputs("task_id", task_instance.id) | ||
| interface_client = InterfaceModuleClient() | ||
| space_infos_result = interface_client.get_space_infos( | ||
| {"space_id": task_instance.space_id, "config_names": "concurrency_control"} | ||
| ) | ||
| space_configs = space_infos_result.get("data", {}).get("configs", {}) | ||
| concurrency_control = space_configs.get("concurrency_control", 0) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 现在代码里会越来越多需要读取空间配置来做调度控制的逻辑，这里获取空间配置的逻辑最好抽象一下，并作为后台worker的一个全局单例来复用配置，空间配置不是一个经常变的数据，可以适当加一个缓存(1分钟)，来防止任务并发量大后，频繁调接口访问interface的问题 |
||
|
|
||
| if concurrency_control and count_running_tasks(task_instance) >= int(concurrency_control): | ||
| push_task_to_queue(settings.redis_inst, task_instance, "start") | ||
| return True | ||
| task_operation = TaskOperation(task_instance=task_instance, queue=settings.BKFLOW_MODULE.code) | ||
| operation_method = getattr(task_operation, "start", None) | ||
| if operation_method is None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,10 +118,11 @@ class TaskInstanceSerializer(serializers.ModelSerializer): | |
| create_time = serializers.DateTimeField(format="%Y-%m-%d %H:%M:%S%z") | ||
| start_time = serializers.DateTimeField(format="%Y-%m-%d %H:%M:%S%z") | ||
| finish_time = serializers.DateTimeField(format="%Y-%m-%d %H:%M:%S%z") | ||
| is_wait = serializers.SerializerMethodField() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里改成is_waiting吧 |
||
|
|
||
| class Meta: | ||
| model = TaskInstance | ||
| fields = "__all__" | ||
| exclude = ["extra_info"] | ||
| read_only_fields = ( | ||
| "id", | ||
| "instance_id", | ||
|
|
@@ -141,6 +142,9 @@ class Meta: | |
| "tree_info_id", | ||
| ) | ||
|
|
||
| def get_is_wait(self, instance): | ||
| return instance.extra_info.get("is_waiting", False) | ||
|
|
||
|
|
||
| class RetrieveTaskInstanceSerializer(TaskInstanceSerializer): | ||
| pipeline_tree = serializers.SerializerMethodField() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -166,3 +166,72 @@ def extract_extra_info(constants, keys=None): | |
| for key in list(constants.keys()) if not keys else keys: | ||
| extra_info.update({key: {"name": constants[key]["name"], "value": constants[key]["value"]}}) | ||
| return json.dumps(extra_info, ensure_ascii=False) | ||
|
|
||
|
|
||
| @redis_inst_check | ||
| def push_task_to_queue(redis_cli, task, operation, node_id=None, data=None): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 需要加一个最大限制，以免打爆redis的队列，如果超出这个上限了，可以在触发执行任务的地方给用户返回错误，告知当前流程达到最大的执行等待上限 |
||
| template_id = task.template_id | ||
| redis_key = f"task_wait_{template_id}" | ||
|
|
||
| task_data = {"operation": operation, "task_id": task.id} | ||
| if node_id: | ||
| task_data.update({"node_id": node_id}) | ||
| if data: | ||
| task_data.update({"node_data": data}) | ||
| task_json = json.dumps(task_data) | ||
| redis_cli.rpush(redis_key, task_json) | ||
| task.extra_info.update({"is_waiting": True}) | ||
| task.save() | ||
| return True | ||
|
|
||
|
|
||
| @redis_inst_check | ||
| def process_task_from_queue(redis_cli, instance_id): | ||
| from bkflow.task.models import TaskInstance | ||
| from bkflow.task.operations import TaskNodeOperation, TaskOperation | ||
|
|
||
| template_id = TaskInstance.objects.get(instance_id=instance_id).template_id | ||
| redis_key = f"task_wait_{template_id}" | ||
| task_json = redis_cli.lpop(redis_key) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里取出任务后，如果调度有问题，任务就丢了，需要加一下保证机制 |
||
| if not task_json: | ||
| return None | ||
|
|
||
| task_data = json.loads(task_json) | ||
| operation = task_data.get("operation") | ||
| task_instance = TaskInstance.objects.get(id=task_data.get("task_id")) | ||
| if operation in ["start", "resume"]: | ||
| task_operation = TaskOperation(task_instance, settings.BKFLOW_MODULE.code) | ||
| operation_method = getattr(task_operation, operation, None) | ||
| else: | ||
| node_operation = TaskNodeOperation(task_instance, task_data.get("node_id")) | ||
| operation_method = getattr(node_operation, operation, None) | ||
|
|
||
| operation_method(operator=operation, **task_data.get("node_data", {})) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里确认下是否是一个同步操作，如果是同步操作，这里的任务从队列取出并执行的动作，不应该在bamboo_engine_eri_post_set_state_handler这个用于设置状态的逻辑里（而且这里会导致队列里的worker始终跟上一个无关任务在同一个worker中，使任务不够分散），应该重新把任务通过celery来分发出去执行
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 在分发任务出去执行时，需要关注下trace的传递是正确的，否则对于这种api或者页面触发变成后台触发的任务，就会丢掉trace相关的信息了 |
||
| task_instance.extra_info.update({"is_waiting": False}) | ||
| task_instance.save() | ||
| return task_instance | ||
|
|
||
|
|
||
| def count_running_tasks(task_instance): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个实现还是太重了，并发量一大，每个任务都需要统计一遍当前task_instances的数据量，db扛不住的 |
||
| from bkflow.task.models import TaskInstance | ||
| from bkflow.task.operations import TaskOperation | ||
|
|
||
| space_id = task_instance.space_id | ||
| template_id = task_instance.template_id | ||
| task_instances = TaskInstance.objects.filter( | ||
| space_id=space_id, template_id=template_id, is_deleted=False, is_started=True, is_finished=False | ||
| ) | ||
|
|
||
| task_operations = [ | ||
| {"task_id": task_instance.id, "operation": TaskOperation(task_instance=task_instance).get_task_states()} | ||
| for task_instance in task_instances | ||
| ] | ||
|
|
||
| task_count = 0 | ||
| for task_operation in task_operations: | ||
| if task_operation["operation"].result is False: | ||
| continue | ||
| if task_operation["operation"].data.get("state") == "RUNNING": | ||
| task_count += 1 | ||
|
|
||
| return task_count | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
整体方案还需要补充一些可观测性的指标,来发现当前系统中,某些流程的队列在不断增加,有达到上限的风险,这个可以通过with start_trace里注入流程当前队列长度来实现