patternhelloworld
diff --git a/Diff for: ‎README.md
+29-9 b/Diff for: ‎README.md
+29-9
diff --git a/Diff for: ‎dist/bo/UrlNormalizer.js
+59-28 b/Diff for: ‎dist/bo/UrlNormalizer.js
+59-28
diff --git a/Diff for: ‎dist/service/EmailAreaService.js
+19-19 b/Diff for: ‎dist/service/EmailAreaService.js
+19-19
@@ -1,38 +1,58 @@
 # Url-knife [![NPM version](https://img.shields.io/npm/v/url-knife.svg)](https://www.npmjs.com/package/url-knife) [![](https://data.jsdelivr.com/v1/package/gh/patternknife/url-knife/badge)](https://www.jsdelivr.com/package/gh/patternknife/url-knife) [![](https://badgen.net/bundlephobia/minzip/url-knife)](https://bundlephobia.com/result?p=url-knife)
 ## Overview
-Extract and decompose (fuzzy) URLs (including emails, which are conceptually a part of URLs) in texts with robust patterns.
-
+Extract and decompose (fuzzy) URLs (including emails, which are conceptually a part of URLs) in texts with ``Area-Pattern-based modularity``.
 - This library is currently being refactored into TypeScript, as it was originally developed in JavaScript. 
 
 #### URL knife
 <a href="https://jsfiddle.net/AndrewKang/xtfjn8g3/" target="_blank">LIVE DEMO</a>
 
 
+## Area-Pattern-Based Modularity
+
+The **Area** represents a designated section of content, such as general text, XML (HTML) areas, URL areas, or EMAIL areas. Each **Area** is associated with a specific set of **Patterns** (regular expressions) tailored to its context.
+
+### Example:
+
+1. In a **TextArea** (general plain text), the system applies a URL-specific regular expression to extract potential URLs.
+2. Once the area is narrowed down to contain URLs, **UrlArea** logic is used, applying URL-specific patterns to decompose the URL into its components (e.g., protocol, domain, path, query parameters).
+
+### Enhanced Accuracy with Regular Expression Indexes:
+To further improve accuracy, the system leverages the **index** (or **offset**) values from regular expressions. These indexes help pinpoint exact locations of matches within the text, ensuring precise extraction and minimizing false positives.
+
+For example:
+- If a **CommentArea** is processed using its specific patterns, the system identifies indexes for matches within that area.
+- These indexes can then be used to exclude matched URLs from a broader **TextArea**, ensuring only relevant URLs are processed and avoiding redundant or incorrect extractions.
+
+### Key Benefits:
+This modular approach ensures that each **Area** is processed efficiently with the most relevant and optimized regular expressions. By incorporating index-based matching, it enables robust, scalable, and highly accurate parsing for various content types while preventing conflicts between overlapping patterns.
+
+
 ## Installation
 
-For ES5 users,
+For ES5 users, refer to ``public/index.html``.
 
 ``` html
 <html>
        <body>
        	<script src="../dist/url-knife.bundle.js"></script>
         <!-- OR -->
-       	<script src="https://cdn.jsdelivr.net/gh/patternknife/[email protected]/dist/url-knife.bundle.min.js"></script>
-       	
-       	<script type="text/javascript">
-       	</script>
+       	<script src="https://cdn.jsdelivr.net/gh/patternknife/[email protected]/dist/url-knife.bundle.min.js"></script> 	
        </body>
 </html>
 ```
 
-For ES6 npm users, run 'npm install --save url-knife' on console.
+For ES6 npm users, run 'npm install --save url-knife' in the console.
 (**Required Node v18.20.4**)
-
 ``` html
 import {TextArea, UrlArea, XmlArea} from 'url-knife';
 ```
+For ES5, prefix each area name with `Pattern` before use:
+```javascript
+Pattern.UrlArea...
+```
 
 ## Syntax & Usage
+
 [Chapter 1. Normalize or parse one URL](#chapter-1-normalize-or-parse-one-url)
 
 [Chapter 2. Extract all URLs or emails](#chapter-2-extract-all-urls-or-emails)
 
@@ -9,17 +9,40 @@ const FuzzyPartialUrlPatterns_1 = require("../pattern/FuzzyPartialUrlPatterns");
 const BasePatterns_1 = require("../pattern/BasePatterns");
 const ProtocolPatterns_1 = require("../pattern/ProtocolPatterns");
 const DomainPatterns_1 = require("../pattern/DomainPatterns");
+const valid_1 = __importDefault(require("../valid"));
 exports.UrlNormalizer = {
-    modifiedUrl: null,
+    sacrificedUrl: null,
+    currentStep: 0,
+    /**
+     * Initializes the UrlNormalizer with a given URL.
+     * @param url - The URL to normalize.
+     */
+    initializeSacrificedUrl(url) {
+        this.sacrificedUrl = util_1.default.Text.removeAllSpaces(valid_1.default.validateAndTrimString(url));
+        if (!this.sacrificedUrl) {
+            throw new Error("modifiedUrl cannot be null or empty");
+        }
+        this.currentStep = 1;
+    },
+    /**
+     * Check if the required previous step is completed.
+     * @param requiredStep - The step that should have been completed.
+     */
+    ensureStepCompleted(requiredStep) {
+        if (this.currentStep != requiredStep) {
+            throw new Error(`Step ${requiredStep} must be completed before this step ${this.currentStep}`);
+        }
+    },
     extractAndNormalizeProtocolFromSpacesRemovedUrl() {
-        if (this.modifiedUrl == undefined) {
-            throw new Error("modifiedUrl cannot be null");
+        this.ensureStepCompleted(1);
+        if (!this.sacrificedUrl) {
+            throw new Error("modifiedUrl cannot be null or empty");
         }
         let protocol = null;
         let rx = new RegExp('^(' + FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.getFuzzyProtocolsRxStr + '|' + FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.fuzzierProtocol + ')' + FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.fuzzierProtocolDomainDelimiter);
         let match;
         let isMatched = false;
-        while ((match = rx.exec(this.modifiedUrl)) !== null) {
+        while ((match = rx.exec(this.sacrificedUrl)) !== null) {
             if (match && match[1]) {
                 isMatched = true;
                 if (match[1] === 'localhost') {
@@ -37,11 +60,13 @@ exports.UrlNormalizer = {
                 break;
             }
         }
-        this.modifiedUrl = this.modifiedUrl.replace(rx, '');
+        this.sacrificedUrl = this.sacrificedUrl.replace(rx, '');
+        this.currentStep = 2;
         return protocol;
     },
     extractAndNormalizeDomainFromProtocolRemovedUrl() {
-        if (this.modifiedUrl == undefined) {
+        this.ensureStepCompleted(2);
+        if (this.sacrificedUrl == undefined) {
             throw new Error("modifiedUrl cannot be null");
         }
         let result = {
@@ -51,7 +76,7 @@ exports.UrlNormalizer = {
         let rx1 = new RegExp('(' + FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.getFuzzyDomainBody + '.*?)(' + FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.optionalFuzzyPort +
             FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.optionalFuzzyUrlParams + ')$', 'gi');
         let match1;
-        while ((match1 = rx1.exec(this.modifiedUrl)) !== null) {
+        while ((match1 = rx1.exec(this.sacrificedUrl)) !== null) {
             // remaining full url
             let domain_temp = match1[0];
             // domain
@@ -141,46 +166,49 @@ exports.UrlNormalizer = {
             else {
                 result.domain = domain_temp2;
             }
-            this.modifiedUrl = domain_temp3;
+            this.sacrificedUrl = domain_temp3;
         }
         //console.log("before : " + this.modifiedUrl)
         // This sort of characters should NOT be located at the start.
-        this.modifiedUrl = this.modifiedUrl.replace(new RegExp('^(?:' + BasePatterns_1.BasePatterns.twoBytesNum + '|' + BasePatterns_1.BasePatterns.langChar + ')+', 'i'), '');
-        //console.log("after : " + this.modifiedUrl)
+        this.sacrificedUrl = this.sacrificedUrl.replace(new RegExp('^(?:' + BasePatterns_1.BasePatterns.twoBytesNum + '|' + BasePatterns_1.BasePatterns.langChar + ')+', 'i'), '');
+        this.currentStep = 3;
         return result;
     },
     extractAndNormalizePortFromDomainRemovedUrl() {
+        this.ensureStepCompleted(3);
         let port = null;
         let rx = new RegExp('^' + FuzzyPartialUrlPatterns_1.FuzzyPartialUrlPatterns.mandatoryFuzzyPort, 'gi');
         let match;
-        if (this.modifiedUrl == undefined) {
+        if (this.sacrificedUrl == undefined) {
             throw new Error("modifiedUrl cannot be null");
         }
-        while ((match = rx.exec(this.modifiedUrl)) !== null) {
+        while ((match = rx.exec(this.sacrificedUrl)) !== null) {
             port = match[0].replace(/^\D+/g, '');
-            if (this.modifiedUrl != undefined) {
-                this.modifiedUrl = this.modifiedUrl.replace(rx, '');
+            if (this.sacrificedUrl != undefined) {
+                this.sacrificedUrl = this.sacrificedUrl.replace(rx, '');
             }
         }
+        this.currentStep = 4;
         return port;
     },
-    finalizeNormalization(protocol, port, domain) {
-        if (this.modifiedUrl == undefined) {
+    extractNormalizedUrl(protocol, port, domain) {
+        this.ensureStepCompleted(4);
+        if (this.sacrificedUrl == undefined) {
             throw new Error("modifiedUrl cannot be null");
         }
         /* Now, only the end part of a domain is left */
         /* Consecutive param delimiters should be replaced into one */
-        this.modifiedUrl = this.modifiedUrl.replace(/[#]{2,}/gi, '#');
-        this.modifiedUrl = this.modifiedUrl.replace(/[/]{2,}/gi, '/');
-        this.modifiedUrl = this.modifiedUrl.replace(/(.*?)[?]{2,}([^/]*?(?:=|$))(.*)/i, function (match, $1, $2, $3) {
+        this.sacrificedUrl = this.sacrificedUrl.replace(/[#]{2,}/gi, '#');
+        this.sacrificedUrl = this.sacrificedUrl.replace(/[/]{2,}/gi, '/');
+        this.sacrificedUrl = this.sacrificedUrl.replace(/(.*?)[?]{2,}([^/]*?(?:=|$))(.*)/i, function (match, $1, $2, $3) {
             //console.log(modified_url + ' a :' + $1 + '?' + $2 + $3);
             return $1 + '?' + $2 + $3;
         });
         /* 'modified_url' must start with '/,?,#' */
         let rx_modified_url = new RegExp('(?:\\/|\\?|\\#)', 'i');
         let match_modified_url;
-        if ((match_modified_url = rx_modified_url.exec(this.modifiedUrl)) !== null) {
-            this.modifiedUrl = this.modifiedUrl.replace(new RegExp('^.*?(' + util_1.default.Text.escapeRegex(match_modified_url[0]) + '.*)$', 'i'), function (match, $1) {
+        if ((match_modified_url = rx_modified_url.exec(this.sacrificedUrl)) !== null) {
+            this.sacrificedUrl = this.sacrificedUrl.replace(new RegExp('^.*?(' + util_1.default.Text.escapeRegex(match_modified_url[0]) + '.*)$', 'i'), function (match, $1) {
                 return $1;
             });
         }
@@ -202,42 +230,45 @@ exports.UrlNormalizer = {
         if (!onlyDomain_str) {
             onlyDomain_str = '';
         }
-        return protocol_str + onlyDomain_str + port_str + this.modifiedUrl;
+        this.currentStep = 5;
+        return protocol_str + onlyDomain_str + port_str + this.sacrificedUrl;
     },
     extractAndNormalizeUriParamsFromPortRemovedUrl() {
-        if (this.modifiedUrl == undefined) {
+        this.ensureStepCompleted(5);
+        if (this.sacrificedUrl == undefined) {
             throw new Error("modifiedUrl cannot be null");
         }
         let result = {
             uri: null,
             params: null
         };
-        if (!this.modifiedUrl || this.modifiedUrl.trim() === '') {
+        if (!this.sacrificedUrl || this.sacrificedUrl.trim() === '') {
             result.params = null;
             result.uri = null;
         }
         else {
             // PARAMS
             let rx3 = new RegExp('\\?(?:.)*$', 'gi');
             let match3;
-            while ((match3 = rx3.exec(this.modifiedUrl)) !== null) {
+            while ((match3 = rx3.exec(this.sacrificedUrl)) !== null) {
                 result.params = match3[0];
             }
-            this.modifiedUrl = this.modifiedUrl.replace(rx3, '');
+            this.sacrificedUrl = this.sacrificedUrl.replace(rx3, '');
             if (result.params === "?") {
                 result.params = null;
             }
             // URI
             let rx4 = new RegExp('[#/](?:.)*$', 'gi');
             let match4;
-            while ((match4 = rx4.exec(this.modifiedUrl)) !== null) {
+            while ((match4 = rx4.exec(this.sacrificedUrl)) !== null) {
                 result.uri = match4[0];
             }
-            this.modifiedUrl = this.modifiedUrl.replace(rx4, '');
+            this.sacrificedUrl = this.sacrificedUrl.replace(rx4, '');
             if (result.uri === "/") {
                 result.uri = null;
             }
         }
+        this.currentStep = 6;
         return result;
     }
 };
@@ -10,7 +10,7 @@ const BasePatterns_1 = require("../pattern/BasePatterns");
 const DomainPatterns_1 = require("../pattern/DomainPatterns");
 exports.EmailAreaService = {
     parseEmail(email) {
-        let obj = {
+        let parsedEmailComponents = {
             email: null,
             removedTailOnEmail: null,
             type: null
@@ -21,49 +21,49 @@ exports.EmailAreaService = {
             if (!valid_1.default.isEmailPattern(email)) {
                 throw new Error('This is not an email pattern');
             }
-            obj.email = email;
+            parsedEmailComponents.email = email;
             if (new RegExp('@' + BasePatterns_1.BasePatterns.everything + '*' + DomainPatterns_1.DomainPatterns.ipV4, 'i').test(email)) {
-                obj.type = 'ipV4';
+                parsedEmailComponents.type = 'ipV4';
             }
             else if (new RegExp('@' + BasePatterns_1.BasePatterns.everything + '*' + DomainPatterns_1.DomainPatterns.ipV6, 'i').test(email)) {
                 //console.log('r : ' + url);
-                obj.type = 'ipV6';
+                parsedEmailComponents.type = 'ipV6';
             }
             else {
-                obj.type = 'domain';
+                parsedEmailComponents.type = 'domain';
             }
             // If no uris no params, we remove suffix in case that it is a meta character.
-            if (obj.email) {
-                if (obj.type !== 'ipV6') {
+            if (parsedEmailComponents.email) {
+                if (parsedEmailComponents.type !== 'ipV6') {
                     // removedTailOnUrl
-                    let rm_part_matches = obj.email.match(new RegExp(BasePatterns_1.BasePatterns.noLangCharNum + '+$', 'gi'));
+                    let rm_part_matches = parsedEmailComponents.email.match(new RegExp(BasePatterns_1.BasePatterns.noLangCharNum + '+$', 'gi'));
                     if (rm_part_matches) {
-                        obj.removedTailOnEmail = rm_part_matches[0];
-                        obj.email = obj.email.replace(new RegExp(BasePatterns_1.BasePatterns.noLangCharNum + '+$', 'gi'), '');
+                        parsedEmailComponents.removedTailOnEmail = rm_part_matches[0];
+                        parsedEmailComponents.email = parsedEmailComponents.email.replace(new RegExp(BasePatterns_1.BasePatterns.noLangCharNum + '+$', 'gi'), '');
                     }
                 }
                 else {
                     // removedTailOnUrl
-                    let rm_part_matches = obj.email.match(new RegExp('[^\\u005D]+$', 'gi'));
+                    let rm_part_matches = parsedEmailComponents.email.match(new RegExp('[^\\u005D]+$', 'gi'));
                     if (rm_part_matches) {
-                        obj.removedTailOnEmail = rm_part_matches[0];
-                        obj.email = obj.email.replace(new RegExp('[^\\u005D]+$', 'gi'), '');
+                        parsedEmailComponents.removedTailOnEmail = rm_part_matches[0];
+                        parsedEmailComponents.email = parsedEmailComponents.email.replace(new RegExp('[^\\u005D]+$', 'gi'), '');
                     }
                 }
                 // If no uri no params, we remove suffix in case that it is non-alphabets.
                 // The regex below means "all except for '.'". It is for extracting all root domains, so non-domain types like ip are excepted.
-                let onlyEnd = obj.email.match(new RegExp('[^.]+$', 'gi'));
+                let onlyEnd = parsedEmailComponents.email.match(new RegExp('[^.]+$', 'gi'));
                 if (onlyEnd && onlyEnd.length > 0) {
                     // this is a root domain only in English like com, ac
                     // but the situation is like com가, ac나
                     if (/^[a-zA-Z]+/.test(onlyEnd[0])) {
-                        if (/[^a-zA-Z]+$/.test(obj.email)) {
+                        if (/[^a-zA-Z]+$/.test(parsedEmailComponents.email)) {
                             // remove non alphabets
-                            const matchedEmail = obj.email.match(/[^a-zA-Z]+$/);
+                            const matchedEmail = parsedEmailComponents.email.match(/[^a-zA-Z]+$/);
                             if (matchedEmail && matchedEmail.length > 0) {
-                                obj.removedTailOnEmail = matchedEmail[0] + obj.removedTailOnEmail;
+                                parsedEmailComponents.removedTailOnEmail = matchedEmail[0] + parsedEmailComponents.removedTailOnEmail;
                             }
-                            obj.email = obj.email.replace(/[^a-zA-Z]+$/, '');
+                            parsedEmailComponents.email = parsedEmailComponents.email.replace(/[^a-zA-Z]+$/, '');
                         }
                     }
                 }
@@ -73,7 +73,7 @@ exports.EmailAreaService = {
             console.log(e);
         }
         finally {
-            return obj;
+            return parsedEmailComponents;
         }
     },
     strictTest(email) {