Skip to content

Commit 6d56ddf

Browse files
author
aaron.liu
committed
update
1 parent 4298506 commit 6d56ddf

File tree

8 files changed

+468
-5
lines changed

8 files changed

+468
-5
lines changed

DataStructure/Linkedin All O1.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
'''
2+
Implement interfaces provided. All methods must run in O(1)
3+
'''
4+
5+
class AllOne:
    """All O(1) data structure (LeetCode 432): inc/dec string counts with
    O(1) min/max key queries.

    Layout:
      - `bucket`: doubly linked list of count buckets kept in ascending count
        order, with sentinels head (count 0) and tail (count +inf).
      - `mapper`: key -> bucket node currently holding that key.
    """

    def __init__(self):
        self.minCount = 0  # NOTE(review): never read or updated; kept for compatibility
        self.maxCount = 0  # NOTE(review): never read or updated; kept for compatibility
        self.bucket = LinkedList()  # ordered count buckets
        self.mapper = {}  # map keys to their current bucket node

    def inc(self, key: str) -> None:
        """Increase key's count by 1 (a new key starts at count 1)."""
        # Unknown keys start from the head sentinel (count 0).
        node = self.bucket.head if key not in self.mapper else self.mapper[key]
        self.mapper[key] = self.bucket.incrementElement(node, key)

    def dec(self, key: str) -> None:
        """Decrease key's count by 1; the key disappears when it reaches 0.

        Assumes `key` is present (guaranteed by the problem contract).
        """
        node = self.mapper[key]
        new_node = self.bucket.decrementElement(node, key)
        if new_node.count == 0:
            # The key dropped to count 0: it was parked on the head sentinel;
            # strip it off and forget the key entirely.
            new_node.removeElement(key)
            del self.mapper[key]
        else:
            self.mapper[key] = new_node

    def getMaxKey(self) -> str:
        """Return any key with the maximal count, or "" when empty."""
        max_node = self.bucket.tail.prev
        if max_node.count == 0:  # only sentinels remain -> structure is empty
            return ""
        return next(iter(max_node.keys))

    def getMinKey(self) -> str:
        """Return any key with the minimal count, or "" when empty."""
        min_node = self.bucket.head.next
        if min_node.count == float("inf"):  # only sentinels remain -> empty
            return ""
        return next(iter(min_node.keys))
42+
43+
44+
class LinkedList:
    """Doubly linked list of count buckets kept in ascending count order.

    Sentinels: head has count 0, tail has count +inf. Real buckets live
    between them, one per distinct count, each holding the keys at that count.
    """

    def __init__(self):
        self.head = Node(0)
        self.tail = Node(float("inf"))
        self.head.next = self.tail
        self.tail.prev = self.head

    def decrementElement(self, node, key):
        """Move `key` from `node` to the bucket for count-1; return that bucket.

        If the new count is 0, the key is parked on the head sentinel; the
        caller is responsible for removing it from there.
        """
        node.removeElement(key)
        prevNode = node.prev
        newCount = node.count-1
        correctNode = None
        if prevNode.count == newCount:
            # 2 nodes are consecutive in count, add directly
            correctNode = prevNode
        else:
            # insert new node
            correctNode = self.insertNext(prevNode, newCount)

        # Drop the old bucket if it just became empty (pop never removes sentinels).
        if not node.keys:
            self.pop(node)

        correctNode.addElement(key)
        return correctNode

    def incrementElement(self, node, key):
        """Move `key` from `node` to the bucket for count+1; return that bucket.

        `node` may be the head sentinel (count 0) for brand-new keys; the
        sentinel never stores keys, so nothing is removed in that case.
        """
        if node.count != 0:
            node.removeElement(key)
        newCount = node.count + 1
        correctNode = None
        if node.next.count == newCount:
            correctNode = node.next
        else:
            correctNode = self.insertNext(node, newCount)

        correctNode.addElement(key)
        # Drop the old bucket if it just became empty (pop never removes sentinels).
        if not node.keys:
            self.pop(node)

        return correctNode

    def insertNext(self, node, count):
        # insert a node in front of this node with count=count
        nextNode = node.next

        new_node = Node(count)
        new_node.prev = node
        new_node.next = nextNode

        node.next = new_node
        nextNode.prev = new_node

        return new_node

    def pop(self, node):
        # pop the node if it has no keys; the 0 / +inf sentinels are never removed
        if node.count == 0 or node.count == float("inf"):
            return

        prev = node.prev
        nextNode = node.next

        prev.next = nextNode
        nextNode.prev = prev
        return
class Node:
    """One bucket in the doubly linked list: every key stored here shares
    the same count. Sentinel buckets use counts 0 and +inf."""

    def __init__(self, count):
        self.keys = set()   # keys currently at this count
        self.count = count  # the count this bucket represents
        self.prev = None    # neighbor links, wired up by the owning list
        self.next = None

    def addElement(self, key):
        """Place `key` into this bucket."""
        self.keys.add(key)

    def removeElement(self, key):
        """Take `key` out of this bucket."""
        self.keys.remove(key)

DataStructure/Linkedin Retain Best Cache.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,13 @@ def __init__(self, data_source: DataSource, entries_to_retain: int):
3030
self.data_source = data_source
3131
self.cache_size = entries_to_retain
3232
self.cache = dict()
33-
self.pq = heapq.heapify([])
34-
35-
pass
33+
self.pq = []
3634

3735
def get(self, key: K) -> Rankable:
    """Return the value for `key`, consulting the backing data source on a miss."""
    # Cache hit: return the retained value directly.
    if key in self.cache:
        return self.cache[key]
    # Cache miss: fetch from the source and retain it.
    self.cache[key] = self.data_source.get(key)
    # Track the entry in a heap so the lowest-priority entry can be evicted.
    # NOTE(review): assumes Item orders so heappop yields the entry that should
    # be evicted first — confirm Item.__lt__.
    heapq.heappush(self.pq, Item(key, self.cache[key]))
    if len(self.pq) > self.cache_size:
        # NOTE(review): heappop returns an Item, which is then used directly as
        # the cache key; unless Item hashes/compares equal to its underlying
        # key this raises KeyError — likely should be heappop(self.pq).key.
        # Also, if the just-inserted entry is the one evicted, the lookup below
        # raises KeyError. Confirm and fix.
        del self.cache[heapq.heappop(self.pq)]
    return self.cache[key]

ML/LinkedIn Decision Tree.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
'''
2+
design and implement a decision tree. focus on termination criteria and OOP.
3+
assume boolean features and boolean label variables.
4+
5+
'''
6+
import math
7+
from typing import List
8+
from collections import Counter
9+
10+
class Feature:
    """A named boolean feature. Identity (equality and hash) is by name only;
    `val` does not participate in comparisons."""

    def __init__(self, name: str = "", val: bool = False):
        self.name = name
        self.val = val

    def __eq__(self, other) -> bool:
        # Only other Feature instances can compare equal, and only by name.
        return isinstance(other, Feature) and self.name == other.name

    def __hash__(self):
        return hash(self.name)
21+
class Instance:
    """A training example: the set of boolean features present, plus a label.

    Fix vs. original: the constructor used a mutable default (`features=set()`)
    that was shared across every instance created without an explicit argument,
    so adding a feature to one instance polluted all others. Default to a fresh
    set per instance instead.
    """

    def __init__(self, features: "set[Feature] | None" = None, label: bool = False):
        self.features = set() if features is None else features
        self.label = label
26+
# decision tree node definition
27+
class Node:
    """A decision-tree node; leaves have no children and no split feature."""

    def __init__(self, left: "Node" = None, right: "Node" = None, feature: "Feature" = None):
        self.left = left        # subtree for instances lacking the feature
        self.right = right      # subtree for instances having the feature
        self.feature = feature  # split feature; None for leaves
33+
# Build a decision tree recursively and return its root.
def init_decision_tree(instances: List["Instance"], features: "set[Feature]") -> "Node":
    """Recursively build a decision tree over boolean features.

    Fix vs. original: the original removed the chosen feature from the shared
    `features` set in place, so exclusions made while building the left subtree
    leaked into the right subtree (and into the caller's set). Each branch now
    receives its own reduced copy, excluding only its ancestors' features.
    """
    if not instances or not features:
        return None

    # Base case: all labels agree -> pure leaf (no split feature).
    # NOTE(review): the leaf carries no label/prediction — confirm how callers
    # read predictions out of the tree.
    if hit_termination_condition(instances):
        return Node()

    max_gain_feature = find_max_gain_feature(instances, features)
    left_instances, right_instances = split_instances(instances, max_gain_feature)

    # For binary features we never split on the same feature twice along a
    # path; use a per-branch copy so sibling subtrees are unaffected.
    # (Would not apply to dense/continuous features.)
    remaining = features - {max_gain_feature}

    left_node = init_decision_tree(left_instances, remaining)
    right_node = init_decision_tree(right_instances, remaining)

    return Node(left_node, right_node, max_gain_feature)
56+
# Stop splitting once every instance carries the same label; otherwise keep going.
def hit_termination_condition(instances: List["Instance"]) -> bool:
    """Return True when all instances share one label (the node is pure).

    Empty input yields False (no single dominating label). Replaces the
    `True if ... else False` anti-idiom with a direct boolean expression.
    """
    counter = Counter(ins.label for ins in instances)  # keyed by label
    return len(counter) == 1
61+
62+
# Pick the split feature whose information gain is highest.
def find_max_gain_feature(instances: List["Instance"], features: "set[Feature]") -> "Feature":
    """Return the feature with maximal info gain (first winner in iteration
    order on ties); None when `features` is empty."""
    best, best_gain = None, float("-inf")
    for candidate in features:
        gain = calculate_info_gain(instances, candidate)
        if gain > best_gain:
            best, best_gain = candidate, gain
    return best
73+
def calculate_entropy(instances: List["Instance"], feature: "Feature") -> float:
    """Binary entropy (in nats) of `feature`'s presence across `instances`.

    entropy = -( p*ln(p) + (1-p)*ln(1-p) ), where p is the fraction of
    instances that contain the feature.

    Fixes vs. original: an empty instance list no longer divides by zero, and
    p == 0 or p == 1 no longer calls log(0) (ValueError); both degenerate
    cases have zero entropy by the standard convention lim p->0 p*ln(p) = 0.
    """
    if not instances:
        return 0.0
    # Count how many instances carry the target feature.
    feat_cnt = sum(1 for ins in instances if feature in ins.features)
    prob = feat_cnt / len(instances)
    if prob == 0.0 or prob == 1.0:
        return 0.0
    return -(prob * math.log(prob) + (1 - prob) * math.log(1 - prob))
86+
def split_instances(instances: List["Instance"], target_feature: "Feature") -> "tuple[list, list]":
    """Partition instances by the target feature, preserving input order.

    Returns (without_feature, with_feature).
    """
    with_feat = [ins for ins in instances if target_feature in ins.features]
    without_feat = [ins for ins in instances if target_feature not in ins.features]
    return (without_feat, with_feat)
95+
def calculate_info_gain(instances: List["Instance"], feature: "Feature") -> float:
    """Information gain of splitting on `feature`:
    parent entropy minus the size-weighted entropies of the two children.

    Fix vs. original: empty input / empty child partitions no longer crash
    inside the entropy computation; an empty partition contributes zero.

    NOTE(review): entropy here is measured on the *split feature's* presence,
    and each child is pure in that feature by construction, so the child terms
    are always 0 and the gain reduces to the parent entropy. Conventional ID3
    measures *label* entropy instead — confirm intent before relying on this.
    """
    if not instances:
        return 0.0

    cur_entropy = calculate_entropy(instances, feature)
    left_instances, right_instances = split_instances(instances, feature)

    gain = cur_entropy
    total = len(instances)
    for part in (left_instances, right_instances):
        if part:  # empty partition: weight 0, skip (avoids division by zero)
            gain -= (len(part) / total) * calculate_entropy(part, feature)
    return gain

Math/LinkedIn Gini coefficient.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
'''
2+
Given a list of connection counts: [0,0,1,1,1,2,3,3,4,5]. connections[i] = x 代表第i个人有x个connections
3+
需要提出一个方法 可以计算出connections的"不均衡性".
4+
5+
可以提出使用: Gini coefficient:
6+
x-axis: cumulative percent of people
7+
y-axis: cumulative percent of the corresponding quantity — connection count in this case.
8+
A formula: g = 1 - 2*integral(f), where f is the function formed by y-x, should be given.
9+
'''
10+
11+
from collections import defaultdict
12+
from typing import List
13+
14+
def inequality_calculation(connections: List[int]) -> float:
    """Gini coefficient of the connection distribution (0 = perfect equality).

    Computes 1 - 2 * (area under the Lorenz curve), where the Lorenz curve
    plots cumulative share of people (x) against cumulative share of
    connections (y), with people sorted by ascending connection count.

    Fix vs. original: the trapezoid accumulation used per-value connection
    counts as the heights instead of the running cumulative total, and treated
    a group of m people as a single trapezoid. That gave wrong results whenever
    several people shared a count (e.g. [1, 1] returned 0.5 instead of 0).
    """
    counter = defaultdict(int)
    mem_cnt = cnct_cnt = 0
    min_num, max_num = float("inf"), float("-inf")

    # Tally population, total connections, and per-value counts.
    for num in connections:
        mem_cnt += 1                    # total people
        cnct_cnt += num                 # total connections
        counter[num] += 1               # people with exactly `num` connections
        min_num = min(min_num, num)
        max_num = max(max_num, num)

    # Edge case: nobody, or nobody has any connections -> perfectly equal.
    if cnct_cnt == 0:
        return 0.0

    area = 0.0      # integral under the Lorenz curve
    cum_cnct = 0    # cumulative connections over people processed so far
    for cur_cnct in range(int(min_num), int(max_num) + 1):
        cur_mem = counter.get(cur_cnct, 0)
        if cur_mem == 0:
            continue
        # cur_mem people each add cur_cnct connections: cur_mem trapezoids of
        # width 1/mem_cnt whose summed heights telescope to the closed form
        # cur_mem*cum_cnct + cur_cnct*cur_mem^2/2 (in raw connection units).
        area += (cur_mem * cum_cnct + cur_cnct * cur_mem * cur_mem / 2) / (cnct_cnt * mem_cnt)
        cum_cnct += cur_cnct * cur_mem

    return 1 - 2 * area
42+
43+
# Quick manual smoke test: print the inequality score for a sample distribution.
connections = [0,0,1,1,1,2,3,3,4,5]
# connections = [0,0,0,1,0,0,1,0,0,0,0,0,0,0]  # alternative, sparser sample
print(inequality_calculation(connections))

Math/LinkedIn Stratified Sampling.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
'''
2+
Given billions of training data in a file. Each data has one of 4 class labels.
3+
Output:
4+
select M_i data from each class, where i in {1, 2, ..., 4}
5+
6+
Example:
7+
feature in json format          label
8+
{"user": 1, features} Blue
9+
{"user": 2, features} Red
10+
...
11+
12+
M_i requires: {100 of Blue, 200 of Red, 300 of Green, 400 of Orange}
13+
14+
optimal solution: reservoir sampling
15+
其它可能解:
16+
1. assign a random number to each data, sort and take the top M.
17+
- works only if all data can be fit into memory
18+
- time complexity O(nlogn), slower than reservoir sampling's O(n)
19+
20+
2. Iterate thru all data and select each one with probability M/N.
21+
- N is not guaranteed known or given in some cases, so it may require another loop to count N.
22+
- since it takes data with probility M/N, so in the end it may not result exact M records.
23+
'''
24+
from typing import List
25+
from collections import defaultdict
26+
import random
27+
28+
class Instance:
    """A single training record; only the class label matters for sampling."""

    def __init__(self, label: str = ""):
        self.label = label
31+
32+
class InstanceIterator:
    """Streaming cursor over records in [start, end); concrete I/O is left
    unimplemented (interview stub).

    Fix vs. original: `has_next` and `next` were declared without `self`, so
    calling them on an instance raised TypeError.
    """

    def __init__(self, start, end):
        self.cur = start  # next position to read
        self.end = end    # exclusive upper bound

    def has_next(self) -> bool:
        # Stub: a real implementation reports whether records remain.
        pass

    def next(self) -> "Instance":
        # Stub: a real implementation returns the next record and advances.
        pass
40+
41+
def sampling(iterator: "InstanceIterator", requirement: "dict[str, int]") -> "dict[str, List[Instance]]":
    """Stratified reservoir sampling: keep up to requirement[label] records
    per label, each retained set being a uniform sample of its stratum.

    Single pass, O(1) work per record. Fix vs. original: labels missing from
    `requirement` no longer raise KeyError — they are treated as quota 0.
    """
    ret = defaultdict(list)      # label -> reservoir of sampled records
    counter = defaultdict(int)   # label -> number of records seen so far

    while iterator.has_next():
        cur_ins = iterator.next()
        cur_label = cur_ins.label

        quota = requirement.get(cur_label, 0)
        seen = counter[cur_label]  # records with this label seen before this one

        if len(ret[cur_label]) < quota:
            # Reservoir not yet full: keep unconditionally.
            ret[cur_label].append(cur_ins)
        else:
            # Reservoir full: keep with probability quota/(seen+1) by drawing a
            # uniform index in [0, seen] and replacing when it lands in range
            # (Algorithm R).
            idx = random.randint(0, seen)
            if idx < quota:
                ret[cur_label][idx] = cur_ins

        counter[cur_label] = seen + 1

    return ret
62+
63+
'''
64+
followup: how to do it using distributed computing framework like Hadoop?
65+
1. 从输入文件加载数据。
66+
2. 为每条记录生成一个随机数。
67+
3. 根据生成的随机数对记录进行排序。
68+
4. 从排序后的记录中选择前 M 条记录,作为最终的随机样本。
69+
----
70+
examples = load '$input' using PigStorage();
71+
examples = foreach examples generate id, RANDOM() as rnd;
72+
sample = order Examples by rnd;
73+
sample = limit sample $M;
74+
----
75+
'''

Stack/LC716MaxStack.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
'''
2+
https://leetcode.com/problems/max-stack/description/ linked高频
3+
'''
4+
15
from collections import OrderedDict
26

37
class Node:

0 commit comments

Comments
 (0)