forked from HL7/plain-language
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_repo.ts
245 lines (203 loc) · 8.89 KB
/
process_repo.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import { promises as fs } from 'fs';
import path from 'path';
import { VertexAI } from "@google-cloud/vertexai";
import Anthropic from "@anthropic-ai/sdk";
import type { TextBlock } from '@anthropic-ai/sdk/resources/index.mjs';
import { exec } from 'child_process';
// ---- CLI setup: resolve the target repo and its package id ----

// First positional argument: path to the IG repository to process.
const repoPath = process.argv[2];
if (!repoPath) {
  console.error('Please provide the repository path as an argument.');
  process.exit(1);
}

// Last non-empty path segment is the repository name (tolerates trailing '/').
// Guard against a degenerate path (e.g. "/" or ""), which would otherwise
// propagate `undefined` into the output filename below.
const repoName = repoPath.split('/').filter(Boolean).pop();
if (!repoName) {
  console.error(`Could not derive a repository name from path: ${repoPath}`);
  process.exit(1);
}

// Load the repository-to-package mapping produced by repo-to-package.ts.
let repoToPackage: Record<string, string> = {};
try {
  const mapping = await fs.readFile('repository_to_package.json', 'utf-8');
  repoToPackage = JSON.parse(mapping);
} catch {
  console.error('Error loading repository_to_package.json. Make sure to run repo-to-package.ts first.');
  process.exit(1);
}

// Fall back to the repo name only when no mapping entry exists (?? rather
// than || so an explicitly-mapped value is never clobbered).
const packageId = repoToPackage[repoName] ?? repoName;

// Ensure the output directory for generated summaries exists.
await fs.mkdir('summaries', { recursive: true });
/**
 * Process one IG repository: skip it when a summary already exists or when it
 * has no 'input' folder; otherwise concatenate its source files and generate
 * a plain-language summary.
 */
async function processRepo(repoPath: string) {
  const inputDir = path.join(repoPath, 'input');
  const outputFile = path.join('summaries', `${packageId}.md`);

  // Skip repos whose summary was already produced by a previous run.
  const summaryExists = await fs.access(outputFile).then(() => true, () => false);
  if (summaryExists) {
    console.log(`Summary ${outputFile} for ${packageId} already exists. Skipping this repo.`);
    return;
  }

  // Only repos with an 'input' folder contain IG sources worth summarizing.
  const hasInputDir = await fs.access(inputDir).then(() => true, () => false);
  if (!hasInputDir) {
    console.log(`No 'input' folder found in ${repoPath}. Skipping this repo.`);
    return;
  }

  const fileContent = await concatenateFiles(inputDir);
  await generateSummary(fileContent);
  // Add this line to run the post-processing step
  // await runPostProcessing();
}
/**
 * Recursively gather the allowed text files under `dir`, order them by
 * relevance (pagecontent paths first, then shallower paths, then files named
 * like an index, then shorter names), and concatenate them into
 * `<source path="...">` blocks without exceeding a 400KB budget.
 */
async function concatenateFiles(dir: string): Promise<string> {
  const allowedExtensions = ['.txt', '.md', '.plantuml', '.fsh'];
  type FileEntry = { path: string; depth: number; name: string; isIndex: boolean; isPageContent: boolean };
  const allFiles: FileEntry[] = [];

  // Depth-first walk collecting every allowed file along with sort metadata.
  const collectFiles = async (currentDir: string): Promise<void> => {
    for (const entry of await fs.readdir(currentDir, { withFileTypes: true })) {
      const fullPath = path.join(currentDir, entry.name);
      if (entry.isDirectory()) {
        await collectFiles(fullPath);
        continue;
      }
      const isAllowed =
        entry.isFile() &&
        allowedExtensions.includes(path.extname(entry.name)) &&
        entry.name !== 'ignoreWarnings.txt';
      if (!isAllowed) continue;
      const relativePath = path.relative(dir, fullPath);
      allFiles.push({
        path: fullPath,
        depth: relativePath.split(path.sep).length,
        name: entry.name,
        isIndex: entry.name.toLowerCase().includes('index'),
        isPageContent: relativePath.toLowerCase().includes('pagecontent'),
      });
    }
  };
  await collectFiles(dir);

  // Relevance order: pagecontent, then shallow, then index-named, then short.
  allFiles.sort((a, b) =>
    (Number(b.isPageContent) - Number(a.isPageContent)) ||
    (a.depth - b.depth) ||
    (Number(b.isIndex) - Number(a.isIndex)) ||
    (a.name.length - b.name.length)
  );

  const maxSize = 400 * 1024; // 400KB in bytes
  let content = '';
  let currentSize = 0;
  for (const file of allFiles) {
    const fileContent = await fs.readFile(file.path, 'utf-8');
    const sourceContent = `<source path="${file.path}">\n${fileContent}</source>\n`;
    const sourceSize = Buffer.byteLength(sourceContent, 'utf-8');
    // A file that would push past the budget is skipped; smaller files later
    // in the list may still fit.
    if (currentSize + sourceSize > maxSize) {
      console.log(`Skipping ${file.path} as it would exceed the 400KB limit.`);
      continue;
    }
    content += sourceContent;
    currentSize += sourceSize;
    if (currentSize >= maxSize) {
      console.log(`Reached 400KB limit. Stopping file processing.`);
      break;
    }
  }
  console.log(`Total content size: ${currentSize / 1024} KB`);
  return content;
}
/**
 * Two-stage summary pipeline:
 *  1. Gemini (Vertex AI) produces a structured analysis of the IG sources.
 *  2. Claude refines that analysis into a plain-language summary.
 * Writes the prompt, intermediate analysis, and final summary to disk.
 * Errors are logged and swallowed so one failing repo does not abort a batch.
 */
async function generateSummary(content: string) {
  const vertexAI = new VertexAI({
    project: "fhir-org-starter-project",
    location: "us-central1",
  });
  const anthropic = new Anthropic();
  const generativeModel = vertexAI.getGenerativeModel({
    model: "gemini-pro-experimental",
    generationConfig: {
      temperature: 0.7,
    }
  });
  const promptInstructions = `# FHIR IG Analysis
Given the FHIR Implementation Guide (IG) source files above, provide a structured analysis addressing the following questions:
1. What is this IG trying to achieve, in what context, and for whom? Explain its objectives in simple terms.
2. How does this IG improve upon previous approaches? (Use only information from the input files; avoid speculation.)
3. What are the key features and technical approaches of this IG?
4. How does this IG relate to broader healthcare standards and regulations?
5. Who are the primary users or beneficiaries of this IG, including patients if applicable?
Provide concise, factual responses to each question based on the content of the IG. Aim for clarity and precision in your analysis. Begin with "# $igName: Analysis" and do not output anything other than the analysis.`;
  const request = {
    systemInstruction: "You are a health information technology expert.",
    contents: [
      { role: 'user', parts: [{ text: content + "\n\n" + promptInstructions }] },
    ]
  };
  // Persist the exact prompt for auditing/reproduction.
  await fs.mkdir('prompts', { recursive: true });
  await fs.writeFile(path.join('prompts', `${packageId}.txt`), JSON.stringify(request, null, 2));
  try {
    const response = await generativeModel.generateContent(request);
    // Fully chain the optionals: Gemini can return an empty candidates array
    // (e.g. on a safety block), and the original partial chaining
    // (`candidates?.[0].content...`) threw in that case.
    const analysis = response.response.candidates?.[0]?.content?.parts?.[0]?.text ?? "";
    console.log('Initial Analysis:', analysis);
    await fs.mkdir('analysis', { recursive: true });
    await fs.writeFile(path.join('analysis', `${packageId}.md`), analysis);
    const basicGuidelines = `
1. Explain the IG's purpose, country of use (if applicable), and context of use / use cases, and key features / how it works. Avoid explaining what standards are in general.
2. Write ~200 words in short paragraphs for a general audience.
3. Use clear, jargon-free language.
4. Write in third-person perspective.
5. Maintain an objective, informative tone.
6. Present information factually.
7. Highlight any key stakeholder benefits.
8. Mention how the IG relates to other standards or regulations, if this is direct and relevant. Otherwise omit this.
9. Avoid promotional language or unverified claims.
`
    const revisionGuidelines = `
Please revise this summary to adhere to the following revision guideline:
- Rather than referring to an "IG" or "Implementation Guide", just call it a "standard".
- Remove any explanation that healthcare standards are like a common language or that they help computers talk to each other. That's common knowledge.
- Remove any explanation of what FHIR is; do not expand the acronym FHIR; just call it FHIR.
- Remove any explanation of what an API is; do not expand the acronym API; just call it an API.
- Remove any explanation of what an EHR is; do not expand the acronym EHR; just call it an EHR.
- Eliminate any speculative or indirect information about benefits.
- Remove any redundancy in the summary.
- Remove any mention of things you don't know or aren't sure about this IG
- Remove any mention that this IG builds on FHIR; that is common knowledge.
`
    const refinementPrompt = `
Here is the analysis of a FHIR Implementation Guide:
${analysis}
Use the analysis to create a plain language summary of the guide that adheres to these guidelines:
${basicGuidelines}
${revisionGuidelines}
Provide only the refined summary as your response, without additional explanations or comments.`;
    // Prefill the assistant turn with "This standard" to force the summary's
    // opening words; the prefill is re-prepended to the response below.
    const msg = await anthropic.messages.create({
      model: "claude-3-5-sonnet-20240620",
      max_tokens: 1182,
      temperature: 0.6,
      system: "You are a skilled communicator with expertise in health information technology and a knack for clear, concise writing.",
      messages: [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": `${refinementPrompt}\n\n`
            }
          ]
        },
        {
          "role": "assistant",
          "content": [
            {
              "type": "text",
              "text": "This standard"
            }
          ]
        }
      ]
    });
    await fs.writeFile(path.join('prompts', `${packageId}-refinement.txt`), JSON.stringify(refinementPrompt, null, 2));
    // Narrow via the content block's `type` discriminant instead of casting
    // and non-null-asserting: non-text blocks (e.g. tool_use) have no `.text`.
    const firstBlock = msg.content[0];
    const refinedText = firstBlock && firstBlock.type === 'text' ? (firstBlock as TextBlock).text : '';
    const refinementResponse = "This standard" + refinedText;
    console.log("Refined as", refinementResponse);
    await fs.writeFile(path.join('summaries', `${packageId}.md`), refinementResponse);
  } catch (error) {
    console.error('Error generating or refining summary:', error);
  }
}
/**
 * Run the link-adding post-processing script in a child process.
 * Resolves once it exits cleanly; logs and rejects with the error otherwise.
 */
function runPostProcessing(): Promise<void> {
  return new Promise((resolve, reject) => {
    exec('bun run add_links.ts', (error, stdout) => {
      if (error) {
        console.error('Error running post-processing:', error);
        reject(error);
        return;
      }
      console.log('Post-processing completed successfully');
      console.log(stdout);
      resolve();
    });
  });
}
processRepo(repoPath).catch(console.error);