Imports can have additional senses (#404)
* use the same column names for all other example sentences even with multiple senses

* allow multiple sentences linked to the same sense

* check for dev server when importing entries

* type the import row everywhere; add a speaker right away when a new speaker is encountered
---------

Co-authored-by: livingtongues <[email protected]>
Co-authored-by: Jacob Bowdoin <[email protected]>
3 people authored Jul 11, 2024
1 parent 4d584dc commit 3f4890a
Showing 47 changed files with 2,832 additions and 641 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/lint.yml
@@ -21,9 +21,11 @@ jobs:
       - run: pnpm install
 
       - name: Run ESLint on changed files
-        uses: tj-actions/eslint-changed-files@v24
+        uses: tj-actions/eslint-changed-files@v25
         with:
           config_path: eslint.config.js
+          # escape_paths: false - needed if using SvelteKit parenthesis in routes
+          level: error
           file_extensions: |
             **/*.ts
             **/*.js
2 changes: 2 additions & 0 deletions .vscode/settings.json
@@ -91,6 +91,8 @@
"a11y-no-noninteractive-element-interactions": "ignore",
},

"comments.visible": false,

"deno.enable": true,
"deno.lint": true,
"deno.enablePaths": [
82 changes: 41 additions & 41 deletions FLEx.model.ts
@@ -1,10 +1,10 @@
 interface Entry {
-  id: string;
-  lexemeForm: MultiString; // thanks for this inspiration - we started with just a lexeme string field, and then people asked for more orthographies and we made them second-class citizens as optional alternate orthographies. This accomplishes the same purpose as the multi-string here, but it's not as elegant and has pain points. For example, once someone decided they wanted to make an alternate orthography the main orthography, but they couldn't. So I don't like our current model and will use a MultiString after our migration.
-  citationForm: MultiString; // Am I correct that citation form is a convention from a world where print is the only medium? We don't have this field. In my opinion, if you have a lexeme that is important enough to add and gloss, like "radii", and the citation form is "radius", then in a digital dictionary these belong as two separate entries with a relationship from radii to radius. Not sure what the relationship would be called, but something like "child-citation", indicating that the "radii" entry is really an offshoot of the base word "radius". At the end of the day we do have a very simple print view, so print conventions are still in view, but in our world they are second-class citizens; web usage with easy bouncing between entries via links is first-class. However, we don't have a system for relationships yet. That will be a further-down-the-road benefit of our migration, in that it will be easy to indicate relationships between entries. For now we do have a few additional fields users can use to add some basic info, like a "plural_form" field and a deprecated "variant" field. I don't really like these, but we have them at the moment.
-  literalMeaning: MultiString; // What is this field for? We have nothing like it. Meaning is based on sense, and you already have gloss and definition fields there.
-  senses: Sense[];
-  note: MultiString; // our notes field is just a string - is this going to cause grief when importing FLEx data? Is this designed this way so people can write notes in whatever writing system they like and have fonts applied appropriately?
+  id: string
+  lexemeForm: MultiString // thanks for this inspiration - we started with just a lexeme string field, and then people asked for more orthographies and we made them second-class citizens as optional alternate orthographies. This accomplishes the same purpose as the multi-string here, but it's not as elegant and has pain points. For example, once someone decided they wanted to make an alternate orthography the main orthography, but they couldn't. So I don't like our current model and will use a MultiString after our migration.
+  citationForm: MultiString // Am I correct that citation form is a convention from a world where print is the only medium? We don't have this field. In my opinion, if you have a lexeme that is important enough to add and gloss, like "radii", and the citation form is "radius", then in a digital dictionary these belong as two separate entries with a relationship from radii to radius. Not sure what the relationship would be called, but something like "child-citation", indicating that the "radii" entry is really an offshoot of the base word "radius". At the end of the day we do have a very simple print view, so print conventions are still in view, but in our world they are second-class citizens; web usage with easy bouncing between entries via links is first-class. However, we don't have a system for relationships yet. That will be a further-down-the-road benefit of our migration, in that it will be easy to indicate relationships between entries. For now we do have a few additional fields users can use to add some basic info, like a "plural_form" field and a deprecated "variant" field. I don't really like these, but we have them at the moment.
+  literalMeaning: MultiString // What is this field for? We have nothing like it. Meaning is based on sense, and you already have gloss and definition fields there.
+  senses: Sense[]
+  note: MultiString // our notes field is just a string - we are going to move to using MultiString to allow for different analysis writing systems. Needed when importing FLEx data.
   // Additional fields we have
   // phonetic?: string;
   // morphology?: string;
@@ -16,63 +16,63 @@ interface Entry {
 }
 
 interface Sense {
-  id: string;
-  gloss: MultiString;
-  definition: MultiString; // we have this field used in our first dictionary, but we don't show the field when it is empty (ie - we don't encourage its use and just use glosses, but that could change)
-  partOfSpeech: string; // in our system this is an array, because some entries serve as multiple parts of speech; we have a specific set which are keys that are translated in the UI (eg. "n" -> "noun" in English / "sustantivo" in Spanish)
-  semanticDomain: string[]; // we have a specific set which are keys that are translated in the UI (it's a majorly simplified system modeled after SemDom with some adjustments). A universal set of domains is nice for cross-linguistic work but doesn't always fit the semantic categories of a language, so future growth in our semantic domains field could go a lot of different directions depending on needs, like accepting different systems (ie - SemDom) or letting a dictionary itself set up custom domains. We also plan to introduce tags, which would be multi-purpose for many different applications and may negate the need for a dictionary to create its own domains.
+  id: string
+  gloss: MultiString
+  definition: MultiString // we have this field used in our first dictionary, but we don't show the field when it is empty (ie - we don't encourage its use and just use glosses, but that could change)
+  partOfSpeech: string // in our system this is an array, because some entries serve as multiple parts of speech; we have a specific set which are keys that are translated in the UI (eg. "n" -> "noun" in English / "sustantivo" in Spanish)
+  semanticDomain: string[] // we have a specific set which are keys that are translated in the UI (it's a majorly simplified system modeled after SemDom with some adjustments). A universal set of domains is nice for cross-linguistic work but doesn't always fit the semantic categories of a language, so future growth in our semantic domains field could go a lot of different directions depending on needs, like accepting different systems (ie - SemDom) or letting a dictionary itself set up custom domains. We also plan to introduce tags, which would be multi-purpose for many different applications and may negate the need for a dictionary to create its own domains.
   // write_in_semantic_domains?: string[] // used to support legacy dictionaries, and obviously not translated. We show these and let users delete them and swap them out for the new system, but we don't allow editing or adding.
-  exampleSentences: ExampleSentence[];
+  exampleSentences: ExampleSentence[]
   // noun_class?: string; additional field we have
 }
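
The translated-keys approach described in the partOfSpeech and semanticDomain comments could look like this minimal sketch; the map contents and helper name are illustrative, not the actual Living Dictionaries code (only the "n" -> "noun"/"sustantivo" pair comes from the comment above):

// Hypothetical sketch of the translated-keys idea: parts of speech are stored
// as stable keys and rendered per UI language.
const partOfSpeechLabels: Record<string, Record<string, string>> = {
  n: { en: 'noun', es: 'sustantivo' },
  v: { en: 'verb', es: 'verbo' }, // the "v" pair is an assumed addition
}

function labelPartOfSpeech(key: string, uiLanguage: string): string {
  return partOfSpeechLabels[key]?.[uiLanguage] ?? key // fall back to the raw key
}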

 interface ExampleSentence { // upgrading these to be first-class citizens called Sentence
-  id: string;
-  sentence: MultiString;
-  translation: MultiString;
-  reference: string; // further fields like this haven't been thought through yet, but there's room to grow
+  id: string
+  sentence: MultiString
+  translation: MultiString
+  reference: string // further fields like this haven't been thought through yet, but there's room to grow
 }

 interface MultiString {
-  values: Record<WritingSystemId, string>; // Our current use of something that's like MultiString doesn't nest values underneath a "values" key, but it works the same way. It's just Record<bcp_string, string>, as in `gloss: { "en": "dog", "es": "perro" }` - is there a good reason to nest under "values" besides leaving room for adding notes or something in the future? What is the reason for the "values" key? As I expand our use of this MultiString idea, I'd like to know more about your experience here.
+  values: Record<WritingSystemId, string> // Our current use of something that's like MultiString doesn't nest values underneath a "values" key, but it works the same way. It's just Record<bcp_string, string>, as in `gloss: { "en": "dog", "es": "perro" }` - is there a good reason to nest under "values" besides leaving room for adding notes or something in the future? What is the reason for the "values" key? As I expand our use of this MultiString idea, I'd like to know more about your experience here.
 }
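
For comparison, a minimal sketch of the two shapes discussed in the values comment - the flat record used today versus the values-nested MultiString (field contents taken from the "dog"/"perro" example above):

// Flat shape currently in use: just a record keyed by writing system.
type FlatMultiString = Record<string, string>
const flatGloss: FlatMultiString = { en: 'dog', es: 'perro' }

// Nested shape from the FLEx model: the wrapper object leaves room for
// sibling metadata (notes, etc.) to be added later without a breaking change.
interface NestedMultiString {
  values: Record<string, string>
}
const nestedGloss: NestedMultiString = { values: { en: 'dog', es: 'perro' } }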

 interface WritingSystem {
-  id: WritingSystemId;
-  name: string;
-  abbreviation: string;
-  font: string;
+  id: WritingSystemId
+  name: string
+  abbreviation: string
+  font: string
 }
 
 interface WritingSystems {
-  analysis: WritingSystem[]; // let's pretend I'm studying a Native American language. This could be English and Spanish for example...
-  vernacular: WritingSystem[]; // and this might be Latin script and a native script?
+  analysis: WritingSystem[] // let's pretend I'm studying a Native American language. This could be English and Spanish for example...
+  vernacular: WritingSystem[] // and this might be Latin script and a native script?
 }
 
-type WritingSystemId = string;
+type WritingSystemId = string

 export interface ILexboxApiHub {
-  GetWritingSystems(): Promise<WritingSystems>;
-  GetEntries(options: QueryOptions): Promise<Entry[]>;
-  SearchEntries(query: string, options: QueryOptions): Promise<Entry[]>;
-  GetEntry(id: string): Promise<Entry>;
-  CreateEntry(entry: Entry): Promise<Entry>;
-  UpdateEntry(id: string, update: JsonOperation[]): Promise<Entry>;
-  DeleteEntry(id: string): Promise<void>;
-  CreateSense(entryId: string, sense: Sense): Promise<Sense>;
-  UpdateSense(entryId: string, senseId: string, update: JsonOperation[]): Promise<Sense>;
-  DeleteSense(entryId: string, senseId: string): Promise<void>;
-  CreateExampleSentence(entryId: string, senseId: string, exampleSentence: ExampleSentence): Promise<ExampleSentence>;
-  UpdateExampleSentence(entryId: string, senseId: string, exampleSentenceId: string, update: JsonOperation[]): Promise<ExampleSentence>;
-  DeleteExampleSentence(entryId: string, senseId: string, exampleSentenceId: string): Promise<void>;
+  GetWritingSystems: () => Promise<WritingSystems>
+  GetEntries: (options: QueryOptions) => Promise<Entry[]>
+  SearchEntries: (query: string, options: QueryOptions) => Promise<Entry[]>
+  GetEntry: (id: string) => Promise<Entry>
+  CreateEntry: (entry: Entry) => Promise<Entry>
+  UpdateEntry: (id: string, update: JsonOperation[]) => Promise<Entry>
+  DeleteEntry: (id: string) => Promise<void>
+  CreateSense: (entryId: string, sense: Sense) => Promise<Sense>
+  UpdateSense: (entryId: string, senseId: string, update: JsonOperation[]) => Promise<Sense>
+  DeleteSense: (entryId: string, senseId: string) => Promise<void>
+  CreateExampleSentence: (entryId: string, senseId: string, exampleSentence: ExampleSentence) => Promise<ExampleSentence>
+  UpdateExampleSentence: (entryId: string, senseId: string, exampleSentenceId: string, update: JsonOperation[]) => Promise<ExampleSentence>
+  DeleteExampleSentence: (entryId: string, senseId: string, exampleSentenceId: string) => Promise<void>
 }

 interface QueryOptions {
-  order: string;
-  count: number;
-  offset: number;
+  order: string
+  count: number
+  offset: number
 }
 
 interface JsonOperation {
-  do_no_know_yet: string;
+  do_no_know_yet: string
 }
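
As a usage illustration, a client could page through entries with the hub interface above. This is a sketch under assumptions: an available ILexboxApiHub implementation, an "order by lexemeForm" convention, and the page size are all hypothetical.

// Illustrative only: paging through entries via the ILexboxApiHub defined above.
async function listAllEntries(api: ILexboxApiHub): Promise<Entry[]> {
  const pageSize = 100 // hypothetical page size
  const all: Entry[] = []
  for (let offset = 0; ; offset += pageSize) {
    const page = await api.GetEntries({ order: 'lexemeForm', count: pageSize, offset })
    all.push(...page)
    if (page.length < pageSize)
      break // last page reached
  }
  return all
}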
4 changes: 3 additions & 1 deletion eslint.config.js
@@ -50,7 +50,6 @@ export default antfu(
     files: ['**/*.test.ts'],
     rules: {
       'test/consistent-test-it': ['error', { fn: 'test' }],
-      'test/no-commented-out-tests': 'error',
       'test/no-disabled-tests': 'error',
       'test/consistent-test-filename': 'error',
       'test/expect-expect': 'error',
@@ -71,6 +70,7 @@ export default antfu(
       'test/prefer-to-have-length': 'error',
       'test/valid-describe-callback': 'error',
       'test/valid-expect': 'error',
+      'test/no-commented-out-tests': 'warn',
     },
   },
   {
@@ -87,6 +87,8 @@ export default antfu(
       'no-console': 'off',
       'ts/no-unused-vars': 'off',
       'ts/no-var-requires': 'off',
+      'node/prefer-global/process': 'off',
+      'unused-imports/no-unused-vars': 'off',
     },
   },
   {
1 change: 1 addition & 0 deletions packages/scripts/.gitignore
@@ -2,3 +2,4 @@ logs
 service-account*
 .env
 sheets-viewer-SA.json
+.env.supabase
50 changes: 24 additions & 26 deletions packages/scripts/algolia/addDictionariesToIndex.ts
@@ -1,41 +1,39 @@
-import { db } from '../config';
-import { updateIndex } from './algolia';
-import { ActualDatabaseEntry } from '@living-dictionaries/types';
+import type { ActualDatabaseEntry } from '@living-dictionaries/types'
+import * as prepare from '@living-dictionaries/functions/src/algolia/prepareDataForIndex'
+import { db } from '../config-firebase'
+import { updateIndex } from './algolia'
 
 // import { prepareDataForIndex } from '@living-dictionaries/functions/src/algolia/prepareDataForIndex';
-import * as prepare from '@living-dictionaries/functions/src/algolia/prepareDataForIndex';
-// @ts-ignore
+// @ts-expect-error
 const prepareDataForIndex = prepare.default
-  .prepareDataForIndex as typeof import('@living-dictionaries/functions/src/algolia/prepareDataForIndex').prepareDataForIndex; // b/c file is declared to be commonjs by its package.json
+  .prepareDataForIndex as typeof import('@living-dictionaries/functions/src/algolia/prepareDataForIndex').prepareDataForIndex // b/c file is declared to be commonjs by its package.json
 
-const indexAllDictionaries = async () => {
-  const dictionariesSnapshot = await db.collection(`dictionaries`).get();
-  const dictionaryIds = dictionariesSnapshot.docs.map((doc) => doc.id);
-  console.log(dictionaryIds);
-  process.stdout.write(dictionaryIds + '\n');
+async function indexAllDictionaries() {
+  const dictionariesSnapshot = await db.collection(`dictionaries`).get()
+  const dictionaryIds = dictionariesSnapshot.docs.map(doc => doc.id)
+  console.log(dictionaryIds)
+  process.stdout.write(`${dictionaryIds}\n`)
 
   for (const dictionaryId of dictionaryIds)
-    await indexDictionary(dictionaryId);
-
-};
+    await indexDictionary(dictionaryId)
+}
 
 async function indexDictionary(dictionaryId: string) {
-  const entriesSnapshot = await db.collection(`dictionaries/${dictionaryId}/words`).get();
-  const entries = await prepareEntriesFromSnapshot(entriesSnapshot, dictionaryId);
-  await updateIndex(entries);
+  const entriesSnapshot = await db.collection(`dictionaries/${dictionaryId}/words`).get()
+  const entries = await prepareEntriesFromSnapshot(entriesSnapshot, dictionaryId)
+  await updateIndex(entries)
 }
 
+// eslint-disable-next-line no-undef
 async function prepareEntriesFromSnapshot(entriesSnapshot: FirebaseFirestore.QuerySnapshot<FirebaseFirestore.DocumentData>, dictionaryId: string) {
   const entryPromises = entriesSnapshot.docs.map(async (doc) => {
-    const dbEntry = doc.data() as ActualDatabaseEntry;
-    const algoliaEntry = await prepareDataForIndex(dbEntry, dictionaryId, db);
-    console.log({ dbEntry, algoliaEntry});
-    return { ...algoliaEntry, objectID: doc.id };
-  });
+    const dbEntry = doc.data() as ActualDatabaseEntry
+    const algoliaEntry = await prepareDataForIndex(dbEntry, dictionaryId, db)
+    console.log({ dbEntry, algoliaEntry })
+    return { ...algoliaEntry, objectID: doc.id }
+  })
 
-  const entries = await Promise.all(entryPromises);
-  return entries;
+  const entries = await Promise.all(entryPromises)
+  return entries
 }
 
 // indexAllDictionaries();
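
The prepare.default cast above is a common workaround for consuming a CommonJS module from ESM-style code; a minimal sketch of the same pattern, with hypothetical package and function names:

// Sketch of the CJS interop pattern used above (names are hypothetical).
// When a package's package.json declares it CommonJS, a namespace import can
// surface module.exports under `default`, so the function is dug out and re-typed:
import * as cjs from 'some-commonjs-package'

const realFunction = cjs.default
  .realFunction as typeof import('some-commonjs-package').realFunction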
31 changes: 15 additions & 16 deletions packages/scripts/algolia/algolia.ts
@@ -1,32 +1,31 @@
-import algoliasearch from 'algoliasearch';
-import { projectId } from '../config';
-import { adminKey } from './algolia-admin-key.json';
-import { AlgoliaEntry } from '@living-dictionaries/types';
+import algoliasearch from 'algoliasearch'
+import type { AlgoliaEntry } from '@living-dictionaries/types'
+import { projectId } from '../config-firebase'
+import { adminKey } from './algolia-admin-key.json'
 
-const ALGOLIA_APP_ID = 'XCVBAYSYXD';
+const ALGOLIA_APP_ID = 'XCVBAYSYXD'
 
-export const client = algoliasearch(ALGOLIA_APP_ID, adminKey);
+export const client = algoliasearch(ALGOLIA_APP_ID, adminKey)
 
 const index = client.initIndex(
-  projectId === 'talking-dictionaries-dev' ? 'entries_dev' : 'entries_prod'
-);
+  projectId === 'talking-dictionaries-dev' ? 'entries_dev' : 'entries_prod',
+)
 
-const MAX_CHUNK_SIZE = 3000;
+const MAX_CHUNK_SIZE = 3000
 // https://www.algolia.com/doc/api-reference/api-methods/add-objects/#examples
 // if forced to iterate instead of save all at once, take note of the rate limiting at 5000 backlogged requests https://www.algolia.com/doc/faq/indexing/is-there-a-rate-limit/
 
 export async function updateIndex(entries: AlgoliaEntry[]) {
   try {
     for (let startOfChunkIndex = 0; startOfChunkIndex < entries.length; startOfChunkIndex += MAX_CHUNK_SIZE) {
-      const endOfChunk = startOfChunkIndex + MAX_CHUNK_SIZE;
-      const chunk = entries.slice(startOfChunkIndex, endOfChunk);
-      console.log({ startOfChunkIndex, endOfChunk, CHUNK_SIZE: MAX_CHUNK_SIZE, chunkLength: chunk.length });
+      const endOfChunk = startOfChunkIndex + MAX_CHUNK_SIZE
+      const chunk = entries.slice(startOfChunkIndex, endOfChunk)
+      console.log({ startOfChunkIndex, endOfChunk, CHUNK_SIZE: MAX_CHUNK_SIZE, chunkLength: chunk.length })
 
-      const { objectIDs } = await index.saveObjects(chunk);
-      console.log(`Entries indexed: ${objectIDs.length}`);
+      const { objectIDs } = await index.saveObjects(chunk)
+      console.log(`Entries indexed: ${objectIDs.length}`)
     }
   } catch (err) {
-    console.log(err);
+    console.log(err)
   }
-
 }
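
The chunking loop in updateIndex generalizes beyond Algolia; a standalone sketch of the same slicing logic (the helper name is illustrative):

// Generic form of the batching used in updateIndex: slice a large array into
// MAX_CHUNK_SIZE pieces and process them sequentially to respect request limits.
const MAX_CHUNK_SIZE = 3000

async function processInChunks<T>(items: T[], processChunk: (chunk: T[]) => Promise<void>) {
  for (let start = 0; start < items.length; start += MAX_CHUNK_SIZE) {
    const chunk = items.slice(start, start + MAX_CHUNK_SIZE)
    await processChunk(chunk) // e.g. chunk => index.saveObjects(chunk)
  }
}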