Skip to content

Commit 6d56ddf

Browse files
author
aaron.liu
committed
update
1 parent 4298506 commit 6d56ddf

File tree

8 files changed

+468
-5
lines changed

8 files changed

+468
-5
lines changed

DataStructure/Linkedin All O1.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
'''
2+
Implement interfaces provided. All methods must run in O(1)
3+
'''
4+
5+
class AllOne:
    """All O(1) data structure (LeetCode 432): inc/dec string counts with
    O(1) min/max key queries.

    Layout:
      - `bucket`: doubly linked list of count buckets kept in ascending count
        order, with sentinels head (count 0) and tail (count +inf).
      - `mapper`: key -> bucket node currently holding that key.
    """

    def __init__(self):
        self.minCount = 0  # NOTE(review): never read or updated; kept for compatibility
        self.maxCount = 0  # NOTE(review): never read or updated; kept for compatibility
        self.bucket = LinkedList()  # ordered count buckets
        self.mapper = {}  # map keys to their current bucket node

    def inc(self, key: str) -> None:
        """Increase key's count by 1 (a new key starts at count 1)."""
        # Unknown keys start from the head sentinel (count 0).
        node = self.bucket.head if key not in self.mapper else self.mapper[key]
        self.mapper[key] = self.bucket.incrementElement(node, key)

    def dec(self, key: str) -> None:
        """Decrease key's count by 1; the key disappears when it reaches 0.

        Assumes `key` is present (guaranteed by the problem contract).
        """
        node = self.mapper[key]
        new_node = self.bucket.decrementElement(node, key)
        if new_node.count == 0:
            # The key dropped to count 0: it was parked on the head sentinel;
            # strip it off and forget the key entirely.
            new_node.removeElement(key)
            del self.mapper[key]
        else:
            self.mapper[key] = new_node

    def getMaxKey(self) -> str:
        """Return any key with the maximal count, or "" when empty."""
        max_node = self.bucket.tail.prev
        if max_node.count == 0:  # only sentinels remain -> structure is empty
            return ""
        return next(iter(max_node.keys))

    def getMinKey(self) -> str:
        """Return any key with the minimal count, or "" when empty."""
        min_node = self.bucket.head.next
        if min_node.count == float("inf"):  # only sentinels remain -> empty
            return ""
        return next(iter(min_node.keys))
42+
43+
44+
class LinkedList:
    """Doubly linked list of count buckets kept in ascending count order.

    Sentinels: head has count 0, tail has count +inf. Real buckets live
    between them, one per distinct count, each holding the keys at that count.
    """

    def __init__(self):
        self.head = Node(0)
        self.tail = Node(float("inf"))
        self.head.next = self.tail
        self.tail.prev = self.head

    def decrementElement(self, node, key):
        """Move `key` from `node` to the bucket for count-1; return that bucket.

        If the new count is 0, the key is parked on the head sentinel; the
        caller is responsible for removing it from there.
        """
        node.removeElement(key)
        prevNode = node.prev
        newCount = node.count-1
        correctNode = None
        if prevNode.count == newCount:
            # 2 nodes are consecutive in count, add directly
            correctNode = prevNode
        else:
            # insert new node
            correctNode = self.insertNext(prevNode, newCount)

        # Drop the old bucket if it just became empty (pop never removes sentinels).
        if not node.keys:
            self.pop(node)

        correctNode.addElement(key)
        return correctNode

    def incrementElement(self, node, key):
        """Move `key` from `node` to the bucket for count+1; return that bucket.

        `node` may be the head sentinel (count 0) for brand-new keys; the
        sentinel never stores keys, so nothing is removed in that case.
        """
        if node.count != 0:
            node.removeElement(key)
        newCount = node.count + 1
        correctNode = None
        if node.next.count == newCount:
            correctNode = node.next
        else:
            correctNode = self.insertNext(node, newCount)

        correctNode.addElement(key)
        # Drop the old bucket if it just became empty (pop never removes sentinels).
        if not node.keys:
            self.pop(node)

        return correctNode

    def insertNext(self, node, count):
        # insert a node in front of this node with count=count
        nextNode = node.next

        new_node = Node(count)
        new_node.prev = node
        new_node.next = nextNode

        node.next = new_node
        nextNode.prev = new_node

        return new_node

    def pop(self, node):
        # pop the node if it has no keys; the 0 / +inf sentinels are never removed
        if node.count == 0 or node.count == float("inf"):
            return

        prev = node.prev
        nextNode = node.next

        prev.next = nextNode
        nextNode.prev = prev
        return
class Node:
    """One bucket in the doubly linked list: every key stored here shares
    the same count. Sentinel buckets use counts 0 and +inf."""

    def __init__(self, count):
        self.keys = set()   # keys currently at this count
        self.count = count  # the count this bucket represents
        self.prev = None    # neighbor links, wired up by the owning list
        self.next = None

    def addElement(self, key):
        """Place `key` into this bucket."""
        self.keys.add(key)

    def removeElement(self, key):
        """Take `key` out of this bucket."""
        self.keys.remove(key)

DataStructure/Linkedin Retain Best Cache.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,13 @@ def __init__(self, data_source: DataSource, entries_to_retain: int):
3030
self.data_source = data_source
3131
self.cache_size = entries_to_retain
3232
self.cache = dict()
33-
self.pq = heapq.heapify([])
34-
35-
pass
33+
self.pq = []
3634

3735
def get(self, key: K) -> Rankable:
    """Return the value for `key`, consulting the backing data source on a miss."""
    # Cache hit: return the retained value directly.
    if key in self.cache:
        return self.cache[key]
    # Cache miss: fetch from the source and retain it.
    self.cache[key] = self.data_source.get(key)
    # Track the entry in a heap so the lowest-priority entry can be evicted.
    # NOTE(review): assumes Item orders so heappop yields the entry that should
    # be evicted first — confirm Item.__lt__.
    heapq.heappush(self.pq, Item(key, self.cache[key]))
    if len(self.pq) > self.cache_size:
        # NOTE(review): heappop returns an Item, which is then used directly as
        # the cache key; unless Item hashes/compares equal to its underlying
        # key this raises KeyError — likely should be heappop(self.pq).key.
        # Also, if the just-inserted entry is the one evicted, the lookup below
        # raises KeyError. Confirm and fix.
        del self.cache[heapq.heappop(self.pq)]
    return self.cache[key]

ML/LinkedIn Decision Tree.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
'''
2+
design and implement a decision tree. focus on termination criteria and OOP.
3+
assume boolean features and boolean label variables.
4+
5+
'''
6+
import math
7+
from typing import List
8+
from collections import Counter
9+
10+
class Feature:
    """A named boolean feature. Identity (equality and hash) is by name only;
    `val` does not participate in comparisons."""

    def __init__(self, name: str = "", val: bool = False):
        self.name = name
        self.val = val

    def __eq__(self, other) -> bool:
        # Only other Feature instances can compare equal, and only by name.
        return isinstance(other, Feature) and self.name == other.name

    def __hash__(self):
        return hash(self.name)
21+
class Instance:
    """A training example: the set of boolean features present, plus a label.

    Fix vs. original: the constructor used a mutable default (`features=set()`)
    that was shared across every instance created without an explicit argument,
    so adding a feature to one instance polluted all others. Default to a fresh
    set per instance instead.
    """

    def __init__(self, features: "set[Feature] | None" = None, label: bool = False):
        self.features = set() if features is None else features
        self.label = label
26+
# decision tree node definition
27+
class Node:
    """A decision-tree node; leaves have no children and no split feature."""

    def __init__(self, left: "Node" = None, right: "Node" = None, feature: "Feature" = None):
        self.left = left        # subtree for instances lacking the feature
        self.right = right      # subtree for instances having the feature
        self.feature = feature  # split feature; None for leaves
33+
# Build a decision tree recursively and return its root.
def init_decision_tree(instances: List["Instance"], features: "set[Feature]") -> "Node":
    """Recursively build a decision tree over boolean features.

    Fix vs. original: the original removed the chosen feature from the shared
    `features` set in place, so exclusions made while building the left subtree
    leaked into the right subtree (and into the caller's set). Each branch now
    receives its own reduced copy, excluding only its ancestors' features.
    """
    if not instances or not features:
        return None

    # Base case: all labels agree -> pure leaf (no split feature).
    # NOTE(review): the leaf carries no label/prediction — confirm how callers
    # read predictions out of the tree.
    if hit_termination_condition(instances):
        return Node()

    max_gain_feature = find_max_gain_feature(instances, features)
    left_instances, right_instances = split_instances(instances, max_gain_feature)

    # For binary features we never split on the same feature twice along a
    # path; use a per-branch copy so sibling subtrees are unaffected.
    # (Would not apply to dense/continuous features.)
    remaining = features - {max_gain_feature}

    left_node = init_decision_tree(left_instances, remaining)
    right_node = init_decision_tree(right_instances, remaining)

    return Node(left_node, right_node, max_gain_feature)
56+
# Stop splitting once every instance carries the same label; otherwise keep going.
def hit_termination_condition(instances: List["Instance"]) -> bool:
    """Return True when all instances share one label (the node is pure).

    Empty input yields False (no single dominating label). Replaces the
    `True if ... else False` anti-idiom with a direct boolean expression.
    """
    counter = Counter(ins.label for ins in instances)  # keyed by label
    return len(counter) == 1
61+
62+
# Pick the split feature whose information gain is highest.
def find_max_gain_feature(instances: List["Instance"], features: "set[Feature]") -> "Feature":
    """Return the feature with maximal info gain (first winner in iteration
    order on ties); None when `features` is empty."""
    best, best_gain = None, float("-inf")
    for candidate in features:
        gain = calculate_info_gain(instances, candidate)
        if gain > best_gain:
            best, best_gain = candidate, gain
    return best
73+
def calculate_entropy(instances: List["Instance"], feature: "Feature") -> float:
    """Binary entropy (in nats) of `feature`'s presence across `instances`.

    entropy = -( p*ln(p) + (1-p)*ln(1-p) ), where p is the fraction of
    instances that contain the feature.

    Fixes vs. original: an empty instance list no longer divides by zero, and
    p == 0 or p == 1 no longer calls log(0) (ValueError); both degenerate
    cases have zero entropy by the standard convention lim p->0 p*ln(p) = 0.
    """
    if not instances:
        return 0.0
    # Count how many instances carry the target feature.
    feat_cnt = sum(1 for ins in instances if feature in ins.features)
    prob = feat_cnt / len(instances)
    if prob == 0.0 or prob == 1.0:
        return 0.0
    return -(prob * math.log(prob) + (1 - prob) * math.log(1 - prob))
86+
def split_instances(instances: List["Instance"], target_feature: "Feature") -> "tuple[list, list]":
    """Partition instances by the target feature, preserving input order.

    Returns (without_feature, with_feature).
    """
    with_feat = [ins for ins in instances if target_feature in ins.features]
    without_feat = [ins for ins in instances if target_feature not in ins.features]
    return (without_feat, with_feat)
95+
def calculate_info_gain(instances: List["Instance"], feature: "Feature") -> float:
    """Information gain of splitting on `feature`:
    parent entropy minus the size-weighted entropies of the two children.

    Fix vs. original: empty input / empty child partitions no longer crash
    inside the entropy computation; an empty partition contributes zero.

    NOTE(review): entropy here is measured on the *split feature's* presence,
    and each child is pure in that feature by construction, so the child terms
    are always 0 and the gain reduces to the parent entropy. Conventional ID3
    measures *label* entropy instead — confirm intent before relying on this.
    """
    if not instances:
        return 0.0

    cur_entropy = calculate_entropy(instances, feature)
    left_instances, right_instances = split_instances(instances, feature)

    gain = cur_entropy
    total = len(instances)
    for part in (left_instances, right_instances):
        if part:  # empty partition: weight 0, skip (avoids division by zero)
            gain -= (len(part) / total) * calculate_entropy(part, feature)
    return gain

Math/LinkedIn Gini coefficient.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
'''
2+
Given a list of connection counts: [0,0,1,1,1,2,3,3,4,5]. connections[i] = x 代表第i个人有x个connections
3+
需要提出一个方法 可以计算出connections的"不均衡性".
4+
5+
可以提出使用: Gini coefficient:
6+
x-axis: cumulative percent of people
7+
y-axis: cumulative percent of the corresponding quantity — connection count in this case.
8+
A formula: g = 1 - 2*integral(f), where f is the function formed by y-x, should be given.
9+
'''
10+
11+
from collections import defaultdict
12+
from typing import List
13+
14+
def inequality_calculation(connections: List[int]) -> float:
    """Gini coefficient of the connection distribution (0 = perfect equality).

    Computes 1 - 2 * (area under the Lorenz curve), where the Lorenz curve
    plots cumulative share of people (x) against cumulative share of
    connections (y), with people sorted by ascending connection count.

    Fix vs. original: the trapezoid accumulation used per-value connection
    counts as the heights instead of the running cumulative total, and treated
    a group of m people as a single trapezoid. That gave wrong results whenever
    several people shared a count (e.g. [1, 1] returned 0.5 instead of 0).
    """
    counter = defaultdict(int)
    mem_cnt = cnct_cnt = 0
    min_num, max_num = float("inf"), float("-inf")

    # Tally population, total connections, and per-value counts.
    for num in connections:
        mem_cnt += 1                    # total people
        cnct_cnt += num                 # total connections
        counter[num] += 1               # people with exactly `num` connections
        min_num = min(min_num, num)
        max_num = max(max_num, num)

    # Edge case: nobody, or nobody has any connections -> perfectly equal.
    if cnct_cnt == 0:
        return 0.0

    area = 0.0      # integral under the Lorenz curve
    cum_cnct = 0    # cumulative connections over people processed so far
    for cur_cnct in range(int(min_num), int(max_num) + 1):
        cur_mem = counter.get(cur_cnct, 0)
        if cur_mem == 0:
            continue
        # cur_mem people each add cur_cnct connections: cur_mem trapezoids of
        # width 1/mem_cnt whose summed heights telescope to the closed form
        # cur_mem*cum_cnct + cur_cnct*cur_mem^2/2 (in raw connection units).
        area += (cur_mem * cum_cnct + cur_cnct * cur_mem * cur_mem / 2) / (cnct_cnt * mem_cnt)
        cum_cnct += cur_cnct * cur_mem

    return 1 - 2 * area
42+
43+
# Quick manual smoke test: print the inequality score for a sample distribution.
connections = [0,0,1,1,1,2,3,3,4,5]
# connections = [0,0,0,1,0,0,1,0,0,0,0,0,0,0]  # alternative, sparser sample
print(inequality_calculation(connections))

Math/LinkedIn Stratified Sampling.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
'''
2+
Given billions of training data in a file. Each data has one of 4 class labels.
3+
Output:
4+
select M_i data from each class, where i in {1, 2, ..., 4}
5+
6+
Example:
7+
feature in json format          label
8+
{"user": 1, features} Blue
9+
{"user": 2, features} Red
10+
...
11+
12+
M_i requires: {100 of Blue, 200 of Red, 300 of Green, 400 of Orange}
13+
14+
optimal solution: reservoir sampling
15+
其它可能解:
16+
1. assign a random number to each data, sort and take the top M.
17+
- works only if all data can be fit into memory
18+
- time complexity O(nlogn), slower than reservoir sampling's O(n)
19+
20+
2. Iterate thru all data and select each one with probability M/N.
21+
- N is not guaranteed known or given in some cases, so it may require another loop to count N.
22+
- since it takes data with probility M/N, so in the end it may not result exact M records.
23+
'''
24+
from typing import List
25+
from collections import defaultdict
26+
import random
27+
28+
class Instance:
    """A single training record; only the class label matters for sampling."""

    def __init__(self, label: str = ""):
        self.label = label
31+
32+
class InstanceIterator:
    """Streaming cursor over records in [start, end); concrete I/O is left
    unimplemented (interview stub).

    Fix vs. original: `has_next` and `next` were declared without `self`, so
    calling them on an instance raised TypeError.
    """

    def __init__(self, start, end):
        self.cur = start  # next position to read
        self.end = end    # exclusive upper bound

    def has_next(self) -> bool:
        # Stub: a real implementation reports whether records remain.
        pass

    def next(self) -> "Instance":
        # Stub: a real implementation returns the next record and advances.
        pass
40+
41+
def sampling(iterator: "InstanceIterator", requirement: "dict[str, int]") -> "dict[str, List[Instance]]":
    """Stratified reservoir sampling: keep up to requirement[label] records
    per label, each retained set being a uniform sample of its stratum.

    Single pass, O(1) work per record. Fix vs. original: labels missing from
    `requirement` no longer raise KeyError — they are treated as quota 0.
    """
    ret = defaultdict(list)      # label -> reservoir of sampled records
    counter = defaultdict(int)   # label -> number of records seen so far

    while iterator.has_next():
        cur_ins = iterator.next()
        cur_label = cur_ins.label

        quota = requirement.get(cur_label, 0)
        seen = counter[cur_label]  # records with this label seen before this one

        if len(ret[cur_label]) < quota:
            # Reservoir not yet full: keep unconditionally.
            ret[cur_label].append(cur_ins)
        else:
            # Reservoir full: keep with probability quota/(seen+1) by drawing a
            # uniform index in [0, seen] and replacing when it lands in range
            # (Algorithm R).
            idx = random.randint(0, seen)
            if idx < quota:
                ret[cur_label][idx] = cur_ins

        counter[cur_label] = seen + 1

    return ret
62+
63+
'''
64+
followup: how to do it using distributed computing framework like Hadoop?
65+
1. 从输入文件加载数据。
66+
2. 为每条记录生成一个随机数。
67+
3. 根据生成的随机数对记录进行排序。
68+
4. 从排序后的记录中选择前 M 条记录,作为最终的随机样本。
69+
----
70+
examples = load '$input' using PigStorage();
71+
examples = foreach examples generate id, RANDOM() as rnd;
72+
sample = order Examples by rnd;
73+
sample = limit sample $M;
74+
----
75+
'''

Stack/LC716MaxStack.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
'''
2+
https://leetcode.com/problems/max-stack/description/ linked高频
3+
'''
4+
15
from collections import OrderedDict
26

37
class Node:

0 commit comments

Comments
 (0)