Building word-to-embedding
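The plan: read a text file, treat every character as a token, assign each one a random Gaussian vector, and keep an extra "UNK" vector for characters not seen in the file. A `Dataset`/`DataLoader` pair then iterates over the lines and looks up each character's vector.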

from torch.utils.data import Dataset, DataLoader
import numpy as np

def readfile(path, embedding_dim):
    # Read the corpus and drop the empty strings left by trailing newlines.
    with open(path, "r", encoding="utf-8") as file:
        all_data = [line for line in file.read().split("\n") if line]

    # Map every character to a random Gaussian vector; "UNK" covers
    # characters that never appeared in the file.
    word_embedding = {"UNK": np.random.normal(size=(embedding_dim,))}
    for data in all_data:
        for word in data:
            if word not in word_embedding:
                word_embedding[word] = np.random.normal(size=(embedding_dim,))

    return all_data, word_embedding

class MyDataset(Dataset):
    """Wraps the list of lines so DataLoader can index into it."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        # Each sample is one raw line of text; the embedding lookup
        # happens later, at iteration time.
        return self.data[item]



if __name__ == "__main__":
    path = "D:前50行.txt"
    embedding_dim = 50
    all_data, word_embedding = readfile(path, embedding_dim)

    dataset = MyDataset(all_data)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    for batch in dataloader:
        # With the default collate function, each batch is a list of strings.
        for line in batch:
            for word in line:
                # Fall back to the "UNK" vector for unseen characters.
                print(word_embedding.get(word, word_embedding["UNK"]))
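Printing one vector at a time is fine for checking the lookup, but a model wants a batched tensor. Below is a minimal sketch, not from the original post, of a custom `collate_fn` that pads every line in a batch with the "UNK" vector and stacks the result into shape (batch_size, max_len, embedding_dim); `collate_batch` is a name introduced here for illustration, and it reuses the `word_embedding` dict and `dataset` built above.

import torch
import numpy as np
from torch.utils.data import DataLoader

def collate_batch(batch, word_embedding):
    # Pad each line to the longest line in the batch, using the
    # "UNK" vector as padding, then stack into one float tensor.
    max_len = max(len(line) for line in batch)
    vectors = []
    for line in batch:
        vecs = [word_embedding.get(w, word_embedding["UNK"]) for w in line]
        vecs += [word_embedding["UNK"]] * (max_len - len(line))
        vectors.append(np.stack(vecs))
    return torch.tensor(np.stack(vectors), dtype=torch.float32)

loader = DataLoader(dataset, batch_size=4,
                    collate_fn=lambda b: collate_batch(b, word_embedding))
for batch in loader:
    print(batch.shape)  # e.g. torch.Size([4, max_len, 50])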
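Also worth noting: the numpy vectors above are fixed at random and never trained. The usual PyTorch route is a word-to-index vocabulary plus a trainable `torch.nn.Embedding` layer. A minimal sketch under that assumption follows; the `word2idx` vocab is introduced here for illustration and is not part of the original code.

import torch
import torch.nn as nn

# Build a character -> index vocabulary from the same corpus,
# reserving index 0 for "UNK".
word2idx = {"UNK": 0}
for line in all_data:
    for word in line:
        word2idx.setdefault(word, len(word2idx))

# A trainable embedding table: one 50-dim vector per vocabulary entry.
embedding = nn.Embedding(num_embeddings=len(word2idx), embedding_dim=50)

# Look up one line: characters -> indices -> trainable vectors.
line = all_data[0]
idx = torch.tensor([word2idx.get(w, 0) for w in line])
vectors = embedding(idx)  # shape: (len(line), 50)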