8 changes: 7 additions & 1 deletion .gitignore
@@ -169,4 +169,10 @@ openai_api/data/fine_food_reviews_with_embeddings_1k_2146.csv
*.bin

langchain/openai-translator/flagged/*
langchain/openai-translator/flask_temps/*
langchain/openai-translator/flask_temps/*
openai-translator/ai_translator/.idea/ai_translator.iml
openai-translator/ai_translator/.idea/inspectionProfiles/profiles_settings.xml
openai-translator/ai_translator/.idea/inspectionProfiles/Project_Default.xml
openai-translator/ai_translator/.idea/misc.xml
openai-translator/ai_translator/.idea/modules.xml
openai-translator/ai_translator/.idea/vcs.xml
313 changes: 313 additions & 0 deletions langchain/jupyter/agents/openai_functionTest.ipynb
@@ -0,0 +1,313 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-03-29T12:01:26.511156800Z",
"start_time": "2024-03-29T12:01:20.975005400Z"
}
},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(temperature=0)"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain.agents import tool\n",
"\n",
"@tool\n",
"def get_word_length(word: str) -> int:\n",
" \"\"\"Returns the length of a word.\"\"\"\n",
" return len(word)\n",
"\n",
"tools = [get_word_length]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:02:39.804177100Z",
"start_time": "2024-03-29T12:02:38.049455200Z"
}
},
"id": "2d9fe2c1a3654657",
"execution_count": 2
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain.schema import SystemMessage\n",
"from langchain.agents import OpenAIFunctionsAgent\n",
"\n",
"system_message = SystemMessage(content=\"你是非常强大的AI助手,但在计算单词长度方面不擅长。\")\n",
"prompt = OpenAIFunctionsAgent.create_prompt(system_message=system_message)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:04:19.417806500Z",
"start_time": "2024-03-29T12:04:19.341513200Z"
}
},
"id": "54350a24c993624d",
"execution_count": 3
},
{
"cell_type": "code",
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\soft\\Python312\\Lib\\site-packages\\langchain_core\\_api\\deprecation.py:117: LangChainDeprecationWarning: The class `langchain.agents.openai_functions_agent.base.OpenAIFunctionsAgent` was deprecated in langchain 0.1.0 and will be removed in 0.2.0. Use create_openai_functions_agent instead.\n",
" warn_deprecated(\n"
]
}
],
"source": [
"agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:04:53.611433300Z",
"start_time": "2024-03-29T12:04:52.422558700Z"
}
},
"id": "7e3e7c9a1e101a69",
"execution_count": 4
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain.agents import AgentExecutor\n",
"\n",
"agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:05:42.170247400Z",
"start_time": "2024-03-29T12:05:42.150934200Z"
}
},
"id": "17922ddf5f435a1d",
"execution_count": 5
},
{
"cell_type": "code",
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\soft\\Python312\\Lib\\site-packages\\langchain_core\\_api\\deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
" warn_deprecated(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001B[1m> Entering new AgentExecutor chain...\u001B[0m\n",
"\u001B[32;1m\u001B[1;3m\n",
"Invoking: `get_word_length` with `{'word': 'educa'}`\n",
"\n",
"\n",
"\u001B[0m\u001B[36;1m\u001B[1;3m5\u001B[0m\u001B[32;1m\u001B[1;3m单词“educa”中有5个字母。\u001B[0m\n",
"\n",
"\u001B[1m> Finished chain.\u001B[0m\n"
]
},
{
"data": {
"text/plain": "'单词“educa”中有5个字母。'"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent_executor.run(\"单词“educa”中有多少个字母?\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:06:13.829616200Z",
"start_time": "2024-03-29T12:06:11.478938400Z"
}
},
"id": "bc07104a44ab6d5b",
"execution_count": 6
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain.prompts import MessagesPlaceholder\n",
"\n",
"MEMORY_KEY = \"chat_history\"\n",
"prompt = OpenAIFunctionsAgent.create_prompt(\n",
" system_message=system_message,\n",
" extra_prompt_messages=[MessagesPlaceholder(variable_name=MEMORY_KEY)]\n",
")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:07:55.933638900Z",
"start_time": "2024-03-29T12:07:55.593759400Z"
}
},
"id": "ccb7cece70540a30",
"execution_count": 7
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain.memory import ConversationBufferMemory\n",
"\n",
"memory = ConversationBufferMemory(memory_key=MEMORY_KEY, return_messages=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:08:53.107717300Z",
"start_time": "2024-03-29T12:08:53.102043400Z"
}
},
"id": "addb142685d40adb",
"execution_count": 8
},
{
"cell_type": "code",
"outputs": [],
"source": [
"agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)\n",
"agent_executor = AgentExecutor(agent=agent, tools=tools, memory=memory, verbose=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:09:50.125508800Z",
"start_time": "2024-03-29T12:09:50.116425500Z"
}
},
"id": "dce86e5dc17f716f",
"execution_count": 9
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001B[1m> Entering new AgentExecutor chain...\u001B[0m\n",
"\u001B[32;1m\u001B[1;3m\n",
"Invoking: `get_word_length` with `{'word': 'educa'}`\n",
"\n",
"\n",
"\u001B[0m\u001B[36;1m\u001B[1;3m5\u001B[0m\u001B[32;1m\u001B[1;3m单词“educa”中有5个字母。\u001B[0m\n",
"\n",
"\u001B[1m> Finished chain.\u001B[0m\n"
]
},
{
"data": {
"text/plain": "'单词“educa”中有5个字母。'"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent_executor.run(\"单词“educa”中有多少个字母?\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:10:15.044103900Z",
"start_time": "2024-03-29T12:10:12.383340Z"
}
},
"id": "f4a142b8bb0cbc0d",
"execution_count": 10
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001B[1m> Entering new AgentExecutor chain...\u001B[0m\n",
"\u001B[32;1m\u001B[1;3m抱歉,我之前给出的答案有误。单词“educa”实际上不是一个真实的单词。请问您还有其他问题吗?\u001B[0m\n",
"\n",
"\u001B[1m> Finished chain.\u001B[0m\n"
]
},
{
"data": {
"text/plain": "'抱歉,我之前给出的答案有误。单词“educa”实际上不是一个真实的单词。请问您还有其他问题吗?'"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent_executor.run(\"那是一个真实的单词吗?\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T12:10:45.587808900Z",
"start_time": "2024-03-29T12:10:43.443403100Z"
}
},
"id": "6fe525108f45b5da",
"execution_count": 11
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
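The deprecation warnings recorded in the cells above already name the replacements: `create_openai_functions_agent` instead of the `OpenAIFunctionsAgent` class, and `invoke` instead of `run`. A minimal sketch of that newer pattern, assuming LangChain 0.1.x and an `OPENAI_API_KEY` in the environment:

```python
# Sketch: the non-deprecated OpenAI-functions agent pattern (assumes LangChain 0.1.x).
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_functions_agent, tool

@tool
def get_word_length(word: str) -> int:
    """Returns the length of a word."""
    return len(word)

tools = [get_word_length]
llm = ChatOpenAI(temperature=0)

# The prompt must expose an `agent_scratchpad` placeholder for the agent's tool calls.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a very powerful assistant, but you are bad at computing word lengths."),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

agent = create_openai_functions_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
result = agent_executor.invoke({"input": "How many letters are in the word 'educa'?"})
print(result["output"])
```

`invoke` returns a dict, so the final answer is read from `result["output"]` rather than a bare string.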
294 changes: 294 additions & 0 deletions langchain/jupyter/agents/reactTest.ipynb
@@ -0,0 +1,294 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-03-29T11:59:53.906392600Z",
"start_time": "2024-03-29T11:59:53.892848300Z"
}
},
"outputs": [],
"source": [
"import os\n",
"#os.environ[\"SERPAPI_API_KEY\"] = \"\""
]
},
{
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": "'a02b49e6015aeb93ce4dbfd99b57418a5fcf37d62d65150fe58f05ea0a4dee17'"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ[\"SERPAPI_API_KEY\"]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:59:53.921749100Z",
"start_time": "2024-03-29T11:59:53.910572700Z"
}
},
"id": "c44e8fe66e3e5f06",
"execution_count": 2
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain_openai import OpenAI\n",
"\n",
"llm = OpenAI(temperature=0)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:46:52.577934400Z",
"start_time": "2024-03-29T11:46:46.942689500Z"
}
},
"id": "1e11ac9db94a3248",
"execution_count": 4
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from langchain.agents import load_tools\n",
"from langchain.agents import initialize_agent\n",
"from langchain.agents import AgentType\n",
"\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:48:45.628852700Z",
"start_time": "2024-03-29T11:48:45.603825200Z"
}
},
"id": "4453850349cd8662",
"execution_count": 7
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
"Collecting numexpr\n",
" Downloading https://pypi.tuna.tsinghua.edu.cn/packages/d0/ff/87d5439fef391607dca416af15a07aebcceccf7e3816966507c11b58a2a3/numexpr-2.9.0-cp312-cp312-win_amd64.whl (96 kB)\n",
" ---------------------------------------- 0.0/96.7 kB ? eta -:--:--\n",
" -------------------------------------- - 92.2/96.7 kB 2.6 MB/s eta 0:00:01\n",
" ---------------------------------------- 96.7/96.7 kB 1.9 MB/s eta 0:00:00\n",
"Requirement already satisfied: numpy>=1.13.3 in d:\\soft\\python312\\lib\\site-packages (from numexpr) (1.26.4)\n",
"Installing collected packages: numexpr\n",
"Successfully installed numexpr-2.9.0\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install numexpr"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:48:39.002740800Z",
"start_time": "2024-03-29T11:48:33.036875100Z"
}
},
"id": "82e339dbb3363e2e",
"execution_count": 6
},
{
"cell_type": "code",
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\soft\\Python312\\Lib\\site-packages\\langchain_core\\_api\\deprecation.py:117: LangChainDeprecationWarning: The function `initialize_agent` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use Use new agent constructor methods like create_react_agent, create_json_agent, create_structured_chat_agent, etc. instead.\n",
" warn_deprecated(\n"
]
}
],
"source": [
"agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:49:42.587337300Z",
"start_time": "2024-03-29T11:49:42.544704200Z"
}
},
"id": "c7752fb2880e1f50",
"execution_count": 8
},
{
"cell_type": "code",
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\soft\\Python312\\Lib\\site-packages\\langchain_core\\_api\\deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
" warn_deprecated(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001B[1m> Entering new AgentExecutor chain...\u001B[0m\n",
"\u001B[32;1m\u001B[1;3m I should use a search engine to find information about Leonardo DiCaprio's girlfriend and then use a calculator to calculate her age.\n",
"Action: Search\n",
"Action Input: \"Leonardo DiCaprio girlfriend\"\u001B[0m\n",
"Observation: \u001B[36;1m\u001B[1;3m[{'title': 'Is Leonardo DiCaprio Engaged to Vittoria Ceretti?', 'link': 'https://www.thecut.com/article/leonardo-dicaprio-vittoria-ceretti-engagement-ring.html', 'source': 'The Cut', 'date': '1 day ago', 'thumbnail': 'https://serpapi.com/searches/6606ab095f2203b7ae8d7f5b/images/7db95bf161919ea5f1437993aeba4501fe9fba0acebd51c0.jpeg'}, {'title': 'Leonardo DiCaprio, 49, gets cozy again with model Vittoria Ceretti, 25, and she even flashes a ring on THAT fi', 'link': 'https://www.dailymail.co.uk/tvshowbiz/article-13243303/Leonardo-DiCaprios-girlfriend-Vittoria-Ceretti-wearing-ring.html', 'source': 'Daily Mail', 'date': '2 days ago', 'thumbnail': 'https://serpapi.com/searches/6606ab095f2203b7ae8d7f5b/images/7db95bf161919ea5cc24078b67e8880b2a2065d86299c6fa.jpeg'}, {'title': 'Leonardo DiCaprio, 49, and Vittoria Ceretti, 25, spark engagement rumors as she flaunts new ring', 'link': 'https://pagesix.com/2024/03/27/entertainment/leonardo-dicaprio-vittoria-ceretti-spark-engagement-rumors/', 'source': 'Page Six', 'date': '1 day ago', 'thumbnail': 'https://serpapi.com/searches/6606ab095f2203b7ae8d7f5b/images/7db95bf161919ea541824bea9924d255b388ee4b94430297.jpeg'}, {'title': 'Leonardo DiCaprio sparks engagement rumors as girlfriend Vittoria Ceretti wears ring', 'link': 'https://www.marca.com/en/lifestyle/celebrities/2024/03/27/66047769ca47418c0e8b4586.html', 'source': 'Marca.com', 'date': '1 day ago', 'thumbnail': 'https://serpapi.com/searches/6606ab095f2203b7ae8d7f5b/images/7db95bf161919ea53904a654108da52820108789c1eba021.jpeg'}, {'title': 'Leonardo DiCaprio Sparks Engagement Rumors As Girlfriend Flaunts New NDA', 'link': 'https://www.theonion.com/leonardo-dicaprio-sparks-engagement-rumors-as-girlfrien-1851372103', 'source': 'The Onion', 'date': '17 hours ago', 'thumbnail': 'https://serpapi.com/searches/6606ab095f2203b7ae8d7f5b/images/7db95bf161919ea527cbadd3e3a65364753b1fe0e1644e19.jpeg'}]\u001B[0m\n",
"Thought:\u001B[32;1m\u001B[1;3m Based on the search results, it seems like Leonardo DiCaprio's current girlfriend is model Vittoria Ceretti.\n",
"Action: Calculator\n",
"Action Input: 25^0.43\u001B[0m\n",
"Observation: \u001B[33;1m\u001B[1;3mAnswer: 3.991298452658078\u001B[0m\n",
"Thought:\u001B[32;1m\u001B[1;3m This is the age of Vittoria Ceretti, Leonardo DiCaprio's girlfriend.\n",
"Final Answer: 3.991298452658078\u001B[0m\n",
"\n",
"\u001B[1m> Finished chain.\u001B[0m\n"
]
},
{
"data": {
"text/plain": "'3.991298452658078'"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.run(\"谁是莱昂纳多·迪卡普里奥的女朋友?她现在年龄的0.43次方是多少?\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:50:59.530688400Z",
"start_time": "2024-03-29T11:50:30.210495300Z"
}
},
"id": "52c7fd4ebdf9f425",
"execution_count": 9
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "bad1d32af38c87fc"
},
{
"cell_type": "markdown",
"source": [
"使用聊天模型(GPT-4)"
],
"metadata": {
"collapsed": false
},
"id": "e503b910460133d1"
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001B[1m> Entering new AgentExecutor chain...\u001B[0m\n",
"\u001B[32;1m\u001B[1;3mThought: The first part of the question is about Leonardo DiCaprio's girlfriend, which I can search for. The second part involves a mathematical calculation that I can perform using a calculator.\n",
"\n",
"Action:\n",
"```\n",
"{\n",
" \"action\": \"Search\",\n",
" \"action_input\": \"Leonardo DiCaprio current girlfriend\"\n",
"}\n",
"```\u001B[0m\n",
"Observation: \u001B[36;1m\u001B[1;3mVittoria Ceretti\u001B[0m\n",
"Thought:\u001B[32;1m\u001B[1;3m```\n",
"{\n",
" \"action\": \"Calculator\",\n",
" \"action_input\": \"0.43 ^ 2\"\n",
"}\n",
"```\u001B[0m\n",
"Observation: \u001B[33;1m\u001B[1;3mAnswer: 0.18489999999999998\u001B[0m\n",
"Thought:\u001B[32;1m\u001B[1;3mFinal Answer: Vittoria Ceretti, 0.1849\u001B[0m\n",
"\n",
"\u001B[1m> Finished chain.\u001B[0m\n"
]
},
{
"data": {
"text/plain": "'Vittoria Ceretti, 0.1849'"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain_openai import ChatOpenAI\n",
"\n",
"chat_model = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n",
"agent = initialize_agent(tools, chat_model, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True)\n",
"agent.run(\"谁是莱昂纳多·迪卡普里奥的女朋友?她现在年龄的0.43次方是多少?\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-29T11:55:04.054776900Z",
"start_time": "2024-03-29T11:54:52.682394100Z"
}
},
"id": "f8e5f3fdde98c5e2",
"execution_count": 10
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
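The same deprecation warnings appear here for `initialize_agent` and `run`; a minimal sketch of the suggested `create_react_agent` path, assuming LangChain 0.1.x with `langchainhub` installed and `OPENAI_API_KEY`/`SERPAPI_API_KEY` set:

```python
# Sketch: the non-deprecated ReAct agent pattern (assumes LangChain 0.1.x + langchainhub).
from langchain import hub
from langchain_openai import OpenAI
from langchain.agents import AgentExecutor, create_react_agent, load_tools

llm = OpenAI(temperature=0)
tools = load_tools(["serpapi", "llm-math"], llm=llm)

prompt = hub.pull("hwchase17/react")  # the standard ReAct prompt template
agent = create_react_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

result = agent_executor.invoke({
    "input": "Who is Leonardo DiCaprio's girlfriend? What is her current age raised to the 0.43 power?"
})
print(result["output"])
```

Note that in the chat-model run above the agent asked the calculator for `0.43 ^ 2` rather than raising the girlfriend's age to the 0.43 power, so its final number (0.1849) does not answer the question, whereas the earlier run's 25 ** 0.43 ≈ 3.99 does.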
308 changes: 308 additions & 0 deletions langchain/jupyter/agents/self_ask_with_searchTest.ipynb

Large diffs are not rendered by default.

@@ -0,0 +1 @@
China won 103 gold medals at the 2023 Chengdu Universiade.
@@ -0,0 +1 @@
China won 103 gold medals at the 2023 Chengdu Universiade.
704 changes: 704 additions & 0 deletions langchain/jupyter/autogpt/autogptTest.ipynb

Large diffs are not rendered by default.

547 changes: 547 additions & 0 deletions langchain/jupyter/chains/router_chainTest.ipynb

Large diffs are not rendered by default.

545 changes: 545 additions & 0 deletions langchain/jupyter/chains/sequential_chainTest.ipynb

Large diffs are not rendered by default.

1,834 changes: 1,834 additions & 0 deletions langchain/jupyter/chains/transform_chainTest.ipynb

Large diffs are not rendered by default.

915 changes: 915 additions & 0 deletions langchain/jupyter/data_connection/document_loaderTest.ipynb

Large diffs are not rendered by default.

1,088 changes: 1,088 additions & 0 deletions langchain/jupyter/data_connection/doucument_transformerTest.ipynb

Large diffs are not rendered by default.

216 changes: 216 additions & 0 deletions langchain/jupyter/data_connection/text_embeddingTest.ipynb

Large diffs are not rendered by default.

249 changes: 249 additions & 0 deletions langchain/jupyter/data_connection/vector_storesTest.ipynb

Large diffs are not rendered by default.

463 changes: 463 additions & 0 deletions langchain/jupyter/memory/memoryTest.ipynb

Large diffs are not rendered by default.

502 changes: 502 additions & 0 deletions langchain/jupyter/model_io/modelTest.ipynb

Large diffs are not rendered by default.

466 changes: 466 additions & 0 deletions langchain/jupyter/model_io/output_parserTest.ipynb

Large diffs are not rendered by default.

1,558 changes: 1,558 additions & 0 deletions langchain/jupyter/model_io/promptTest.ipynb

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions langchain/openai-translator/ai_translator/config.yaml
@@ -0,0 +1,5 @@
model_name: "gpt-3.5-turbo"
input_file: "../tests/test.pdf"
output_file_format: "markdown"
source_language: "English"
target_language: "Chinese"
2 changes: 2 additions & 0 deletions langchain/openai-translator/ai_translator/main.py
@@ -17,4 +17,6 @@

# Instantiate the PDFTranslator class and call translate_pdf()
translator = PDFTranslator(config.model_name)
LOG.debug(config.input_file)
LOG.debug(config.output_file_format)
translator.translate_pdf(config.input_file, config.output_file_format, pages=None)
25 changes: 13 additions & 12 deletions langchain/openai-translator/tests/test_translated.md
@@ -1,19 +1,20 @@
测试数据
这个数据集包含了由OpenAI的AI语言模型ChatGPT提供的两个测试样本。
这些样本包括一个Markdown表格和一个英文文本段落,可以用来测试支持文本和表格格式的英译中翻译软件
这些样本包括一个Markdown表格和一个英文文本段落,可用于测试支持文本和表格格式的英文到中文翻译软件
文本测试
快速的棕色狐狸跳过懒狗。这个句子包含了英语字母表中的每个字母至少一次。句子是经常用来测试字体、键盘和其他与文本相关的工具的。除了英语,其他许多语言也有句子。由于语言的独特特点,有些句子更难构造。
快速的棕色狐狸跳过懒狗。这个句子包含了英语字母表中的每个字母至少一次。全字母句经常用于测试字体、键盘和其他文本相关工具。除了英语,许多其他语言也有全字母句。由于语言的独特特点,有些全字母句更难构造。
表格测试

| 水果 | 颜色 | 价格(美元) |
| --- | --- | --- |
| 苹果 | 红色 | 1.2 |
| 香蕉 | 黄色 | 0.5 |
| 橙子 | 橙色 | 0.8 |
| 草莓 | 红色 | 2.5 |
| 蓝莓 | 蓝色 | 3.0 |
| 猕猴桃 | 绿色 | 1.0 |
| 芒果 | 橙色 | 1.5 |
| 葡萄 | 紫色 | 2.00 |
| 水果,颜色,价格(美元) |
| --- |
| 苹果,红色,1.2 |
| 香蕉,黄色,0.5 |
| 橙子,橙色,0.8 |
| 草莓,红色,2.5 |
| 蓝莓,蓝色,3.0 |
| 猕猴桃,绿色,1.0 |
| 芒果,橙色,1.5 |
| 葡萄,紫色,2.00 |

---

8 changes: 8 additions & 0 deletions openai-translator/.idea/.gitignore
17 changes: 17 additions & 0 deletions openai-translator/.idea/inspectionProfiles/Project_Default.xml
7 changes: 7 additions & 0 deletions openai-translator/.idea/misc.xml
8 changes: 8 additions & 0 deletions openai-translator/.idea/modules.xml
10 changes: 10 additions & 0 deletions openai-translator/.idea/openai-translator.iml
6 changes: 6 additions & 0 deletions openai-translator/.idea/vcs.xml
8 changes: 8 additions & 0 deletions openai-translator/ai_translator/.idea/.gitignore
9 changes: 5 additions & 4 deletions openai-translator/ai_translator/book/content.py
@@ -26,7 +26,7 @@ def check_translation_type(self, translation):
return True
elif self.content_type == ContentType.TABLE and isinstance(translation, list):
return True
elif self.content_type == ContentType.IMAGE and isinstance(translation, PILImage.Image):
elif self.content_type == ContentType.IMAGE and isinstance(translation, str):
return True
return False

@@ -47,13 +47,14 @@ def set_translation(self, translation, status):
raise ValueError(f"Invalid translation type. Expected str, but got {type(translation)}")

LOG.debug(translation)
LOG.warning(type(translation))
# Convert the string to a list of lists
table_data = [row.strip().split() for row in translation.strip().split('\n')]
LOG.debug(table_data)
# Create a DataFrame from the table_data
translated_df = pd.DataFrame(table_data[1:], columns=table_data[0])
LOG.debug(translated_df)
self.translation = translated_df
#translated_df = pd.DataFrame(table_data[1:], columns=table_data[0])
#LOG.debug(translated_df)
self.translation = translation#translated_df
self.status = status
except Exception as e:
LOG.error(f"An error occurred during table translation: {e}")
13 changes: 13 additions & 0 deletions openai-translator/ai_translator/config.yaml
@@ -0,0 +1,13 @@
OpenAIModel:
model: "gpt-3.5-turbo"
api_key: "your_openai_api_key"

GLMModel:
model_url: "gpt-3.5-turbo"
timeout: 300

common:
book: "tests/test.pdf"
file_format: "markdown"
pre_name_write: "tests/img/"
pre_name_read: "img/"
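`flask_server.py`, `gradio_server.py`, and `main.py` below all read this file through `utils.ConfigLoader`, whose implementation is not shown in this diff. A minimal sketch of such a loader, assuming it simply wraps `yaml.safe_load` (a hypothetical stand-in, not the project's actual class):

```python
# Hypothetical stand-in for utils.ConfigLoader, assuming it wraps yaml.safe_load.
import yaml

class ConfigLoader:
    def __init__(self, config_path: str):
        self.config_path = config_path

    def load_config(self) -> dict:
        with open(self.config_path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)

config = ConfigLoader("config.yaml").load_config()
print(config["OpenAIModel"]["model"])  # "gpt-3.5-turbo"
print(config["common"]["book"])        # "tests/test.pdf"
```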
115 changes: 115 additions & 0 deletions openai-translator/ai_translator/flask_server.py
@@ -0,0 +1,115 @@
import sys
import os
from argparse import Namespace

from model import OpenAIModel

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from flask import Flask, request, send_file, jsonify
from utils import ArgumentParser, LOG, ConfigLoader
from translator import PDFTranslator

app = Flask(__name__)

TEMP_FILE_DIR = "flask_temps/"

@app.route('/translate', methods=['POST'])
def translate():
try:
#input_file = request.files['input_file']
source_text = request.form.get('source_text', '')
source_language = request.form.get('source_language', 'English')
target_language = request.form.get('target_language', 'Chinese')

translation_text = Translator.translate_to_text(source_text=source_text, target_language=target_language)
response = {
'status': 'success',
'message': translation_text
}
return jsonify(response), 200

except Exception as e:
response = {
'status': 'error',
'message': str(e)
}
return jsonify(response), 400

@app.route('/translation', methods=['POST'])
def translation():
try:
input_file = request.files['input_file']
source_language = request.form.get('source_language', 'English')
target_language = request.form.get('target_language', 'Chinese')

LOG.debug(f"[input_file]\n{input_file}")
LOG.debug(f"[input_file.filename]\n{input_file.filename}")

if input_file and input_file.filename:
# # Create a temporary file
input_file_path = TEMP_FILE_DIR + input_file.filename
LOG.debug(f"[input_file_path]\n{input_file_path}")

input_file.save(input_file_path)

# Call the translation function
output_file_path = Translator.translate_pdf(
input_file_path,
pre_name_write,
pre_name_read,
file_format,
target_language=target_language)

# Remove the temporary file
# os.remove(input_file_path)

# Build the full output file path
output_file_path = os.getcwd() + "/" + output_file_path
LOG.debug(output_file_path)

# Return the translated file
return send_file(output_file_path, as_attachment=True)
except Exception as e:
response = {
'status': 'error',
'message': str(e)
}
return jsonify(response), 400



def initialize_translator():
# Parse command-line arguments
# argument_parser = ArgumentParser()
# args = argument_parser.parse_arguments()

args = Namespace(config='config.yaml', model_type='OpenAIModel', glm_model_url=None, timeout=None,
openai_model='gpt-3.5-turbo', openai_api_key=None, book='tests/test.pdf', file_format='markdown',
pre_name_write='tests/img/', pre_name_read='img/')

config_loader = ConfigLoader(args.config)

config = config_loader.load_config()

model_name = args.openai_model if args.openai_model else config['OpenAIModel']['model']
api_key = args.openai_api_key if args.openai_api_key else config['OpenAIModel']['api_key']
model = OpenAIModel(model=model_name, api_key=api_key)

pdf_file_path = args.book if args.book else config['common']['book']
global file_format
file_format = args.file_format if args.file_format else config['common']['file_format']

global pre_name_write
pre_name_write = args.pre_name_write if args.pre_name_write else config['common']['pre_name_write']
global pre_name_read
pre_name_read = args.pre_name_read if args.pre_name_read else config['common']['pre_name_read']

# Instantiate the PDFTranslator class and call translate_pdf()
global Translator
Translator = PDFTranslator(model)


if __name__ == "__main__":
# Initialize the translator
initialize_translator()
# Start the Flask web server
app.run(host="0.0.0.0", port=5000, debug=True)
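Once the server is running, both endpoints can be exercised from a small client. A usage sketch with `requests`, assuming the server is reachable on localhost:5000 and `tests/test.pdf` exists:

```python
# Usage sketch for the two Flask endpoints above (assumes a local server on port 5000).
import requests

BASE = "http://localhost:5000"

# /translate: translate a plain-text snippet
resp = requests.post(f"{BASE}/translate", data={
    "source_text": "The quick brown fox jumps over the lazy dog.",
    "source_language": "English",
    "target_language": "Chinese",
})
print(resp.json())

# /translation: upload a PDF and save the translated file returned as an attachment
with open("tests/test.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE}/translation",
        files={"input_file": ("test.pdf", f, "application/pdf")},
        data={"source_language": "English", "target_language": "Chinese"},
    )
with open("test_translated.md", "wb") as out:
    out.write(resp.content)
```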
70 changes: 70 additions & 0 deletions openai-translator/ai_translator/gradio_server.py
@@ -0,0 +1,70 @@
import sys
import os
from argparse import Namespace

import gradio as gr

from model import OpenAIModel

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils import ArgumentParser, LOG, ConfigLoader
from translator import PDFTranslator

def translation(input_file, source_language, target_language):
LOG.debug(f"[翻译任务]\n源文件:{input_file.name}\n源语言:{source_language}\n目标语言:{target_language}")

output_file_path = Translator.translate_pdf(input_file.name, pre_name_write, pre_name_read, file_format)

LOG.info(f"返回路径: {output_file_path}")
return output_file_path

def launcher_gradio():
iface = gr.Interface(
fn=translation,
title="OpenAI-Translator v1.0(PDF电子书翻译工具)",
inputs=[
gr.File(label="上传PDF文件"),
gr.Textbox(label="源语言(默认:英文)", placeholder="English", value="English"),
gr.Textbox(label="目标语言(默认:中文)", placeholder="Chinese", value="Chinese")
],
outputs=[
gr.File(label="下载翻译文件")
]
)
iface.launch(share=True, server_name="0.0.0.0", server_port=10001)

def initialize_translator():
#argument_parser = ArgumentParser()
#args = argument_parser.parse_arguments()

args = Namespace(config='config.yaml', model_type='OpenAIModel', glm_model_url=None, timeout=None,
openai_model='gpt-3.5-turbo', openai_api_key=None, book='tests/test.pdf', file_format='markdown',
pre_name_write='tests/img/', pre_name_read='img/')

config_loader = ConfigLoader(args.config)

config = config_loader.load_config()

model_name = args.openai_model if args.openai_model else config['OpenAIModel']['model']
api_key = args.openai_api_key if args.openai_api_key else config['OpenAIModel']['api_key']
model = OpenAIModel(model=model_name, api_key=api_key)

pdf_file_path = args.book if args.book else config['common']['book']
global file_format
file_format = args.file_format if args.file_format else config['common']['file_format']

global pre_name_write
pre_name_write = args.pre_name_write if args.pre_name_write else config['common']['pre_name_write']
global pre_name_read
pre_name_read = args.pre_name_read if args.pre_name_read else config['common']['pre_name_read']

# Instantiate the PDFTranslator class and call translate_pdf()
global Translator
Translator = PDFTranslator(model)
# translator.translate_pdf(pdf_file_path, pre_name_write, pre_name_read, file_format)


if __name__ == "__main__":
initialize_translator()
launcher_gradio()
14 changes: 11 additions & 3 deletions openai-translator/ai_translator/main.py
@@ -1,5 +1,6 @@
import sys
import os
from argparse import Namespace

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

@@ -8,8 +9,12 @@
from translator import PDFTranslator

if __name__ == "__main__":
argument_parser = ArgumentParser()
args = argument_parser.parse_arguments()
#argument_parser = ArgumentParser()
#args = argument_parser.parse_arguments()

args = Namespace(config='config.yaml', model_type='OpenAIModel', glm_model_url=None, timeout=None,
openai_model='gpt-3.5-turbo', openai_api_key=None, book='tests/test.pdf', file_format='markdown', pre_name_write='tests/img/', pre_name_read='img/')
print(f"args = {args}")
config_loader = ConfigLoader(args.config)

config = config_loader.load_config()
@@ -22,6 +27,9 @@
pdf_file_path = args.book if args.book else config['common']['book']
file_format = args.file_format if args.file_format else config['common']['file_format']

pre_name_write = args.pre_name_write if args.pre_name_write else config['common']['pre_name_write']
pre_name_read = args.pre_name_read if args.pre_name_read else config['common']['pre_name_read']

# Instantiate the PDFTranslator class and call translate_pdf()
translator = PDFTranslator(model)
translator.translate_pdf(pdf_file_path, file_format)
translator.translate_pdf(pdf_file_path, pre_name_write, pre_name_read, file_format)
2 changes: 1 addition & 1 deletion openai-translator/ai_translator/model/model.py
@@ -5,7 +5,7 @@ def make_text_prompt(self, text: str, target_language: str) -> str:
return f"翻译为{target_language}{text}"

def make_table_prompt(self, table: str, target_language: str) -> str:
return f"翻译为{target_language},保持间距(空格,分隔符),以表格形式返回:\n{table}"
return f"所有单词翻译为{target_language},保持间距(空格,分隔符),以表格形式返回:\n{table}"

def translate_prompt(self, content, target_language: str) -> str:
if content.content_type == ContentType.TEXT:
Binary file not shown.
Binary file added openai-translator/ai_translator/tests/test.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions openai-translator/ai_translator/tests/test_translated.md
@@ -0,0 +1,18 @@
测试数据
这个数据集包含了由OpenAI的AI语言模型ChatGPT提供的两个测试样本。
这些样本包括一个markdown表格和一个英文文本段落,可以用来测试支持文本和表格格式的英译中翻译软件。
文本测试
快速的棕色狐狸跳过了懒惰的狗。这个全字母句包含了英语字母表中的每一个字母。全字母句经常用来测试字体、键盘和其他与文本有关的工具。除了英语,还有许多其他语言的全字母句。一些全字母句由于语言的独特特点更难构建。
表格测试

水果 颜色 价格(美元)
苹果 红色 1.20
香蕉 黄色 0.50
橙子 橙色 0.80
草莓 红色 2.50
蓝莓 蓝色 3.00
奇异果 绿色 1.00
芒果 橙色 1.50
葡萄 紫色 2.00

![img/pdf_img0.png](img/pdf_img0.png)
17 changes: 16 additions & 1 deletion openai-translator/ai_translator/translator/pdf_parser.py
@@ -9,7 +9,7 @@ class PDFParser:
def __init__(self):
pass

def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book:
def parse_pdf(self, pdf_file_path: str, pre_name_write: str, pages: Optional[int] = None) -> Book:
book = Book(pdf_file_path)

with pdfplumber.open(pdf_file_path) as pdf:
@@ -27,6 +27,10 @@ def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book:
# Store the original text content
raw_text = pdf_page.extract_text()
tables = pdf_page.extract_tables()
imgs = pdf_page.images




# Remove each cell's content from the original text
for table_data in tables:
@@ -45,6 +49,16 @@ def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book:
page.add_content(text_content)
LOG.debug(f"[raw_text]\n {cleaned_raw_text}")

if imgs:
for img in imgs:
bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
cropped_page = pdf_page.crop(bbox)
im = cropped_page.to_image(antialias=True)
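# NOTE: the file name below is fixed, so if a page contains several images each save overwrites the previous one and only the last image per page is kept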
imgName = "pdf_img0.png"
im.save(f"{pre_name_write}{imgName}")
img_content = Content(content_type=ContentType.IMAGE, original=imgName, translation=imgName)
page.add_content(img_content)
LOG.debug(f"[image]\n{img_content.translation}")


# Handling tables
@@ -55,4 +69,5 @@ def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book:

book.add_page(page)


return book
34 changes: 25 additions & 9 deletions openai-translator/ai_translator/translator/pdf_translator.py
@@ -3,24 +3,40 @@
from translator.pdf_parser import PDFParser
from translator.writer import Writer
from utils import LOG
from book import ContentType


class PDFTranslator:
def __init__(self, model: Model):
self.model = model
self.pdf_parser = PDFParser()
self.writer = Writer()

def translate_pdf(self, pdf_file_path: str, file_format: str = 'PDF', target_language: str = '中文', output_file_path: str = None, pages: Optional[int] = None):
self.book = self.pdf_parser.parse_pdf(pdf_file_path, pages)
def translate_pdf(self, pdf_file_path: str, pre_name_write: str, pre_name_read: str, file_format: str = 'PDF', target_language: str = '中文', output_file_path: str = None, pages: Optional[int] = None):
self.book = self.pdf_parser.parse_pdf(pdf_file_path, pre_name_write, pages)

LOG.debug(self.book.pages)
for page_idx, page in enumerate(self.book.pages):
for content_idx, content in enumerate(page.contents):
prompt = self.model.translate_prompt(content, target_language)
LOG.debug(prompt)
translation, status = self.model.make_request(prompt)
LOG.info(translation)
if content.content_type != ContentType.IMAGE:
prompt = self.model.translate_prompt(content, target_language)
LOG.debug(prompt)
translation, status = self.model.make_request(prompt)
LOG.info(translation)

# Update the content in self.book.pages directly
self.book.pages[page_idx].contents[content_idx].set_translation(translation, status)
elif content.content_type == ContentType.IMAGE:
translation, status = content.translation, True
self.book.pages[page_idx].contents[content_idx].set_translation(translation, status)

# Update the content in self.book.pages directly
self.book.pages[page_idx].contents[content_idx].set_translation(translation, status)

self.writer.save_translated_book(self.book, output_file_path, file_format)

output_file_path = self.writer.save_translated_book(self.book, pre_name_read, output_file_path, file_format)
return output_file_path

def translate_to_text(self, source_text: str, target_language: str) -> str:
prompt = self.model.make_text_prompt(source_text, target_language)
LOG.debug(prompt)
translation, status = self.model.make_request(prompt)
return translation
34 changes: 27 additions & 7 deletions openai-translator/ai_translator/translator/writer.py
@@ -6,6 +6,7 @@
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
)
import pandas as pd

from book import Book, ContentType
from utils import LOG
@@ -14,14 +15,16 @@ class Writer:
def __init__(self):
pass

def save_translated_book(self, book: Book, output_file_path: str = None, file_format: str = "PDF"):
def save_translated_book(self, book: Book, pre_name_read: str, output_file_path: str = None, file_format: str = "PDF"):
if file_format.lower() == "pdf":
self._save_translated_book_pdf(book, output_file_path)
output_file_path = self._save_translated_book_pdf(book, output_file_path)
elif file_format.lower() == "markdown":
self._save_translated_book_markdown(book, output_file_path)
output_file_path = self._save_translated_book_markdown(book, pre_name_read, output_file_path)
else:
raise ValueError(f"Unsupported file format: {file_format}")

return output_file_path

def _save_translated_book_pdf(self, book: Book, output_file_path: str = None):
if output_file_path is None:
output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.pdf')
@@ -76,7 +79,7 @@ def _save_translated_book_pdf(self, book: Book, output_file_path: str = None):
doc.build(story)
LOG.info(f"翻译完成: {output_file_path}")

def _save_translated_book_markdown(self, book: Book, output_file_path: str = None):
def _save_translated_book_markdown(self, book: Book, pre_name_read: str, output_file_path: str = None):
if output_file_path is None:
output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.md')

@@ -95,14 +98,31 @@ def _save_translated_book_markdown(self, book: Book, output_file_path: str = Non
elif content.content_type == ContentType.TABLE:
# Add table to the Markdown file
table = content.translation
#df = pd.DataFrame(table)
#markdown_table = table.to_markdown()
output_file.write(table + '\n\n')
'''
LOG.warning(table)
header = '| ' + ' | '.join(str(column) for column in table.columns) + ' |' + '\n'
separator = '| ' + ' | '.join(['---'] * len(table.columns)) + ' |' + '\n'
# body = '\n'.join(['| ' + ' | '.join(row) + ' |' for row in table.values.tolist()]) + '\n\n'
body = '\n'.join(['| ' + ' | '.join(str(cell) for cell in row) + ' |' for row in table.values.tolist()]) + '\n\n'
LOG.warning(body)
output_file.write(header + separator + body)
'''
elif content.content_type == ContentType.IMAGE:
LOG.debug( content.translation )
image = pre_name_read + content.translation
markdown_text = f"![{image}]({image})"
LOG.debug(image)
LOG.debug(markdown_text)
output_file.write(markdown_text)



# Add a page break (horizontal rule) after each page except the last one
if page != book.pages[-1]:
output_file.write('---\n\n')
#if page != book.pages[-1]:
# output_file.write('---\n\n')

LOG.info(f"翻译完成: {output_file_path}")
LOG.info(f"翻译完成: {output_file_path}")
return output_file_path
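The Markdown writer now emits the raw table translation string directly (the DataFrame-based rendering is commented out above), which matches the plain, pipe-less table seen in openai-translator/ai_translator/tests/test_translated.md. If that string ever needs to be rendered back into a Markdown table, a minimal sketch, assuming one row per line with whitespace-separated cells (a hypothetical helper):

```python
# Hypothetical helper: turn a whitespace-delimited table string back into a Markdown table.
# Assumes one row per line and that individual cells contain no spaces.
def table_string_to_markdown(table_str: str) -> str:
    rows = [line.split() for line in table_str.strip().splitlines() if line.strip()]
    if not rows:
        return ""
    header = "| " + " | ".join(rows[0]) + " |"
    separator = "| " + " | ".join(["---"] * len(rows[0])) + " |"
    body = "\n".join("| " + " | ".join(cells) + " |" for cells in rows[1:])
    return "\n".join([header, separator, body]) + "\n\n"

# e.g. output_file.write(table_string_to_markdown(content.translation))
```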
24 changes: 15 additions & 9 deletions openai-translator/requirements.txt
@@ -1,9 +1,15 @@
openai==1.14.2
pdfplumber
simplejson
requests
PyYAML
pillow
reportlab
pandas
loguru
openai==1.20.0
langchain==0.1.13
langchain-openai==0.1.1
langchain-experimental==0.0.55
chromadb==0.4.24
faiss-cpu==1.8.0
gradio==4.20.0
unstructured==0.12.6
google-search-results==2.4.2
tiktoken==0.6.0
matplotlib
plotly
numpy
scikit-learn
pandas
15 changes: 15 additions & 0 deletions requeirmentsT
@@ -0,0 +1,15 @@
openai==1.14.2
langchain==0.1.13
langchain-openai==0.1.1
langchain-experimental==0.0.55
chromadb==0.4.24
faiss-cpu==1.8.0
gradio==4.20.0
unstructured==0.12.6
google-search-results==2.4.2
tiktoken==0.6.0
matplotlib
plotly
numpy
scikit-learn
pandas