# podcast_parser/article.py
"""
This example describes how to use the workflow interface to chat.
"""

import os, json, glob, time
from dotenv import load_dotenv  # 新增
from cozepy import COZE_CN_BASE_URL
from cozepy import Coze, TokenAuth, Message, ChatStatus, MessageContentType, WorkflowExecuteStatus

# Load environment variables from the .env file (COZE_API_TOKEN is read via os.getenv below).
load_dotenv()

class CozeClient:
    """Base client holding an authenticated Coze SDK handle and a workflow id.

    The API token is read from the ``COZE_API_TOKEN`` environment variable and
    requests are sent to the endpoint given by ``COZE_CN_BASE_URL``.
    """

    def __init__(self, workflow_id: str):
        """Create the SDK client and remember which workflow to run.

        :param workflow_id: Identifier of the Coze workflow this client targets.
        """
        # The SDK talks to api.coze.com by default; base_url is passed
        # explicitly so that api.coze.cn is used instead.
        self.coze = Coze(
            auth=TokenAuth(token=os.getenv('COZE_API_TOKEN')),
            base_url=COZE_CN_BASE_URL,
        )
        self.workflow_id = workflow_id

class ReaderClient(CozeClient):
    """Client bound to the workflow that reads content for a page URL."""

    def __init__(self):
        super().__init__(workflow_id='7527581309394452526')

    def retrieve(self, page_url: str):
        """
        Run the reader workflow for one page with ``is_async=False``.
        :param page_url: The URL of the page to read.
        :return: The object returned by ``coze.workflows.runs.create``.
        """
        runs = self.coze.workflows.runs
        run_parameters = {"page_url": page_url}
        return runs.create(
            workflow_id=self.workflow_id,
            is_async=False,
            parameters=run_parameters,
        )

class WorkflowClient(CozeClient):
    """Client bound to the workflow that creates articles from a page URL."""

    def __init__(self):
        super().__init__(workflow_id='7526872708178772022')

    def submit_task(self, page_url: str):
        """
        Start the article-creation workflow for one page with ``is_async=True``.
        :param page_url: The URL of the page to create articles from.
        :return: The object returned by ``coze.workflows.runs.create``; callers
            in this file read its ``execute_id`` to poll the run.
        """
        runs = self.coze.workflows.runs
        run_parameters = {"page_url": page_url}
        return runs.create(
            workflow_id=self.workflow_id,
            is_async=True,
            parameters=run_parameters,
        )

    def history(self, execute_id: str):
        """
        Fetch the run history of a previously submitted execution.
        :param execute_id: Execution id taken from the submitted run.
        :return: The object returned by ``run_histories.retrieve``.
        """
        run_histories = self.coze.workflows.runs.run_histories
        return run_histories.retrieve(
            workflow_id=self.workflow_id,
            execute_id=execute_id,
        )

def create_articles(page_url: str, poll_interval: float = 5, timeout: "float | None" = None):
    """Submit the article-creation workflow for a page and wait for it to finish.

    The run is started asynchronously and its history is polled until the
    status is either success or failure.

    :param page_url: The URL of the page to create articles from.
    :param poll_interval: Seconds to sleep between two status polls.
    :param timeout: Maximum number of seconds to wait for a terminal status.
        ``None`` (the default) waits indefinitely, as the original loop did.
    :return: The last run history retrieved (successful or failed).
    :raises TimeoutError: If ``timeout`` is set and elapses before the run
        reaches a terminal status.
    """
    client = WorkflowClient()
    execute_id = client.submit_task(page_url).execute_id
    # monotonic() is immune to wall-clock adjustments while we wait.
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        history = client.history(execute_id)
        status = history.execute_status
        if status == WorkflowExecuteStatus.SUCCESS:
            return history
        # The SDK's failure member is named FAIL; referencing a non-existent
        # FAILED member would raise AttributeError on the first unfinished poll.
        if status == WorkflowExecuteStatus.FAIL:
            print("Workflow execution failed. Reason:", history.error_message)
            return history

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Workflow run {execute_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)


def retrieve_articles(page_url: str):
    """Run the reader workflow for a page and return its decoded ``output``.

    :param page_url: The URL of the page whose articles should be read.
    :return: The value stored under the ``"output"`` key of the workflow's
        JSON ``data`` payload. If the payload is not valid JSON, the error is
        printed and an empty list is returned so callers that iterate over
        the result simply process nothing.
    """
    client = ReaderClient()
    workflow = client.retrieve(page_url)

    try:
        return json.loads(workflow.data)["output"]
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        # Previously execution fell through to `return data` with `data`
        # unbound, turning a reported decode error into an UnboundLocalError.
        return []

def save_to_markdown(input_path, output_path):
    """Convert every ``*.json`` article file in a directory to a Markdown file.

    Each JSON file is expected to be an object; its ``topic`` (default
    ``unknown_topic``) names the output file, and its ``content`` and ``url``
    (both defaulting to the empty string) make up the Markdown body.
    Files sharing a topic overwrite one another; inputs are processed in
    sorted order so the surviving file is deterministic.

    :param input_path: Directory containing the ``*.json`` article files.
    :param output_path: Directory that receives the ``<topic>.md`` files; it is
        created (including parents) when missing.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)

    for json_file in sorted(glob.glob(os.path.join(input_path, "*.json"))):
        # Close the input before writing the output; nothing below needs it open.
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        topic = str(data.get('topic', 'unknown_topic'))
        # A path separator in the topic would point outside output_path or at
        # a missing sub-directory, so neutralise it in the file name.
        for separator in (os.sep, os.altsep):
            if separator:
                topic = topic.replace(separator, "_")
        content = data.get('content', '')
        url = data.get('url', '')

        output_file = os.path.join(output_path, f"{topic}.md")
        with open(output_file, 'w', encoding='utf-8') as out_f:
            out_f.write(f"{content}\n\n[Read more]({url})\n")

def get_title(content):
    """Return the first line of *content* without leading ``#`` marks or whitespace.

    Surrounding whitespace of the whole text and of the first line is removed
    before the leading heading markers are stripped.
    """
    first_line, _, _ = content.strip().partition('\n')
    heading_prefix_chars = '# \t\n\r\f\v'
    return first_line.strip().lstrip(heading_prefix_chars)

def save_to_json(data):
    """Write each article in *data* to ``articles/<title>.json``.

    Every element of *data* must provide the keys ``content``, ``title`` and
    ``page_url``. The stored record has the keys ``topic`` (the element's
    ``title``), ``content``, ``url`` (the element's ``page_url``) and ``title``
    (the first line of the content, via ``get_title``).

    The ``articles`` directory is created when missing. Path separators in the
    title are replaced in the file name, and an empty title falls back to
    ``untitled`` so the file is not written as a hidden ``.json`` that a
    ``*.json`` glob would skip.

    :param data: Iterable of article mappings as described above.
    """
    os.makedirs("articles", exist_ok=True)

    for item in data:
        article = item['content']
        topic = item['title']
        title = get_title(article)
        # Use a separate name instead of rebinding the loop variable.
        record = {
            "topic": topic,
            "content": article,
            "url": item['page_url'],
            "title": title,
        }

        file_stem = title
        for separator in (os.sep, os.altsep):
            if separator:
                file_stem = file_stem.replace(separator, "_")
        file_stem = file_stem or "untitled"

        # The context manager closes the handle even if json.dump raises.
        with open(f"articles/{file_stem}.json", "w", encoding="utf-8") as out_f:
            json.dump(record, out_f, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    # Example usage
    page_url = "https://www.diancang.xyz/waiguomingzhu/17921/335655.html"
    # create_articles(page_url)
    data = retrieve_articles(page_url)
    print(data)
    save_to_json(data)

    save_to_markdown("articles", "/workspace/project/podcast_articles/docs/文章/")