tfidf 计算
imageimage
image
[输入]:
word_list = []
for i in range(len(corpus)):
word_list.append(corpus[i].split(' '))
print(word_list)
[输出]:
[['this', 'is', 'the', 'first', 'document'],
['this', 'is', 'the', 'second', 'second', 'document'],
['and', 'the', 'third', 'one'],
['is', 'this', 'the', 'first', 'document']]
[输入]:
countlist = []
for i in range(len(word_list)):
count = Counter(word_list[i])
countlist.append(count)
countlist
[输出]:
[Counter({'document': 1, 'first': 1, 'is': 1, 'the': 1, 'this': 1}),
Counter({'document': 1, 'is': 1, 'second': 2, 'the': 1, 'this': 1}),
Counter({'and': 1, 'one': 1, 'the': 1, 'third': 1}),
Counter({'document': 1, 'first': 1, 'is': 1, 'the': 1, 'this': 1})]
# word可以通过count得到,count可以通过countlist得到
# count[word]可以得到每个单词的词频, sum(count.values())得到整个句子的单词总数
def tf(word, count):
return count[word] / sum(count.values())
# 统计的是含有该单词的句子数
def n_containing(word, count_list):
return sum(1 for count in count_list if word in count)
# len(count_list)是指句子的总数,n_containing(word, count_list)是指含有该单词的句子的总数,加1是为了防止分母为0
def idf(word, count_list):
return math.log(len(count_list) / (1 + n_containing(word, count_list)))
# 将tf和idf相乘
def tfidf(word, count, count_list):
return tf(word, count) * idf(word, count_list)
auc计算
在有M个正样本,N个负样本的数据集里。一共有MN对样本(一对样本即,一个正样本与一个负样本)。统计这MN对样本里,正样本的预测概率大于负样本的预测概率的个数。
image
ID label pro
A 0 0.1
B 0 0.4
C 1 0.4
D 1 0.8
假设有4条样本。2个正样本,2个负样本,那么MN=4。即总共有4个样本对。分别是:
(D,B),(D,A),(C,B),(C,A)。
auc=(1+1+1+0.5)/22=0.875
下面这个代码不能处理pred相等时的情况,但是之前又一次笔试这么写的都过了样例
#include<iostream>
#include<algorithm>
#include<vector>
using namespace std;
struct node {
int label;
double pred;
}num[1010];
bool cmp(node x, node y)
{
return x.pred < y.pred;
}
int main() {
int n, neg = 0, pos = 0;
cin >> n;
for (int i = 0; i < n; i++)
{
cin >> num[i].label;
if (num[i].label == 1)pos++;
else neg++;
}
for (int i = 0; i < n; i++)
cin >> num[i].pred;
sort(num, num + n, cmp);
int cnt = 0, cntsum = 0;
for (int i = 0; i < n; i++)
{
if (num[i].label == 1)
cntsum += cnt;
else cnt++;
}
cout << cntsum * 1.0 / (pos*neg);
return 0;
}
/*
4
0 0 1 1
0.1 0.4 0.35 0.8
*/
二叉树:输出根节点到叶子的路径
#include<iostream>
#include<algorithm>
#include<vector>
using namespace std;
struct TreeNode {
int val;
TreeNode*right, *left;
};
void dfs(TreeNode*root, vector<int>&path, vector<vector<int>>&ans)
{
if (root == NULL)return;
path.push_back(root->val);
if (!root->left && !root->right) {
ans.push_back(path);
return;
}
if (root->left)dfs(root, path, ans);
if (root->right)dfs(root, path, ans);
}
int main() {
TreeNode*root;
vector<int>path;
vector<vector<int>>ans;
dfs(root, path, ans);
}
找到字串中合法匹配的连续对数
`#s = input()`
`s ``=` `'(())(()()()'`
`#s = '(())(()'`
`re ``=` `[]`
`dp ``=` `[``0``]`
`for` `i ``in` `s:`
`if` `not` `re:`
`re.append(i)`
`dp.append(``0``)`
`else``:`
`if` `i``=``=``'('``:`
`re.append(i)`
`dp.append(``0``)`
`else``:`
`if` `re[``-``1``]``=``=``'('``:`
`re.pop()`
`dp.append(dp.pop()``+``1``)`
`else``:`
`re.append(``')'``)`
`dp.append(``0``)`
`print``(re)`
`print``(dp)`
`m ``=` `0`
`cur ``=` `0`
`for` `i ``in` `dp:`
`if` `i!``=``0``:`
`cur``+``=``i`
`m ``=` `max``(cur,m)`
`else``:`
`cur ``=` `0`
`print``(m)`
题目:1-N的自然数中,少了一个,找出这个数
思路:
1、求和的思路。求出1-N的和,减去数列的总和,差即为这个数
public int findLost(int[] a,int n){
int sum = 0;
int sum1 = 0;
for(int i=0;i<n;i++){
sum+=a[i];
sum1+=i;
}
int res = sum1+n-sum;
return res;
}
1
2
3
4
5
6
7
8
9
10
2、用异或的方法。任何数异或自己都等于0,任何数异或0都等于他自己。异或两次即可
假如缺的为3。result = 1245N
第二次异或后 result = 1245N 12345N
= 0^3 = 3
public int findLost(int[] a,int n){
int result = 0;
for (int i = 0; i < a.length; i++) {
result = result ^ a[i];
}
for (int i = 0; i < n; i++) {
result=result^(i+1);
}
return result;
}
题目:1-N的自然数中,少了两个,找出这个数
输入N,求N的阶乘的末尾有多少个0
思路:n!= 123...n;我们要分析一下0是怎么来的,0是2*5得来的,那也就是说看有多少个2,5就可以了,
再分析,因子2出现的次数,2,4,6,8...,因子5出现的次数,5,10,15,25...
很显然,2出现的次数一定是比5出现的次数多的,那么我们只需要计算5出现的次数有多少,就可以得到会有多少个10,也就是会有多少个0了。所以思路就是,遍历1-n,求每个数中因子5的个数,加加加,就ok了。
求完全二叉树的最后一层的最后一个节点
BinaryTreeNode* getLastNode(BinaryTreeNode* root)
{
if(!root || (!root->left && !root->right))return root;
int depth = 0;
BinaryTreeNode* curNode = root;
while(curNode)//计算二叉树的深度
{
depth++;
curNode = curNode->left;
}
int level = 0,tmpDepth = 0;
while(root)
{
level++;//当前遍历到哪一层,跟节点是第一层
if(level == depth)break;//防止右子树为空
curNode = root;
if(curNode->right)
{
BinaryTreeNode* pCur = curNode;//当前节点的父节点
curNode = curNode->right;
tmpDepth = level + 1;
while(curNode->left)
{
tmpDepth++;
pCur = curNode;
curNode = curNode->left;
}
if(tmpDepth < depth)root = root -> left;//二分查找左子树
else if(!pCur->right || pCur->right == curNode)return curNode;
else root = root->right;//二分查找右子树
}
else root = root->left;
}
return root;
}
判断是否为完全二叉树
链表快排
ListNode* partition(ListNode*start, ListNode*end)
{
int num = start->val;
ListNode*p = start, *q = start->next;
while (q != end)
{
if (q->val < num) {
p = p->next;
swap(p->val, q->val);
}
q = q->next;
}
swap(p->val, start->val);
return p;
}
void quicksort(ListNode*start, ListNode*end) {
if (start != end)
{
ListNode*index = partition(start, end);
quicksort(start, index);
quicksort(index->next, end);
}
}
不同长度的绳子有不同的价值,一根绳子如何切分可以让总价值最大
https://www.cnblogs.com/tgycoder/p/4980655.html
int dfs(int p[], int n)
{
if (n == 0)return 0;
int profit = 0;
for (int i = 1; i <= n; i++)
profit = max(profit, p[i] + dfs(p, n - i));
return profit;
}
int dfs(int p[], int r[], int n)
{
if (r[n])return r[n];
int profit = 0;
for (int i = 1; i <= n; i++)
profit = max(profit, p[i] + dfs(p, r, n - i));
r[n] = profit;
return profit;
}
dijkstra
int G[maxn][maxn];
int d[maxn];
bool vis[maxn];
void dijkstra(int s)
{
fill(d, d + maxn, INF);
d[s] = 0;
while (1)
{
int u = -1, mind = INF;
for (int i = 0; i < n; i++)
if (!vis[i] && d[i] < mind)mind = d[i], u = i;
if (u == -1)break;
vis[u] = true;
for (int v = 0; v < n; v++)
{
if (!vis[v] && G[u][v] < INF)
d[v] = min(d[v], d[u] + G[u][v]);
}
}
}
数轴上的最长连续线段
struct Interval {
int start, end;
};
static bool cmp(Interval x, Interval y)
{
return x.start < y.start;
}
int merge(vector<Interval>intervals) {
vector<Interval>ans;
int maxlen = 0;
sort(intervals.begin(), intervals.end(), cmp);
ans.push_back(intervals[0]);
for (int i = 1; i < intervals.size(); i++)
{
if (intervals[i].start <= ans.back().end)
{
ans.back().start = min(ans.back().start, intervals[i].start);
ans.back().end = max(ans.back().end, intervals[i].end);
maxlen = max(maxlen, ans.back().end - ans.back().start - 1);
}
else if (intervals[i].start > ans.back().end)
{
ans.push_back(intervals[i]);
maxlen = max(maxlen, intervals[i].start-intervals[i].end);
}
}
return maxlen;
}
矩阵中的最短路,有门有钥匙。动态规划加状态向量
一个m*n矩阵,只能往左或往右,矩阵中的数字代表不同的权值,求一条路径,该路径的权值和与所给的target值最接近。
找子串在原串中第一次出现的位置
网友评论