Files
2024-08-24 10:56:04 +05:30

94 lines
2.5 KiB
Python

import json
import re
from pathlib import Path
from labml import logger
from labml.logger import Text
# Root of the annotated-implementations source tree that gets scanned.
HOME = Path('./labml_nn').absolute()
print(HOME)

# Matches a markdown-style parenthesized link `(https://arxiv.org/abs/<id>)`
# and captures the paper id. NOTE: with re.VERBOSE, whitespace and `#`
# comments inside the pattern are ignored by the regex engine.
REGEX = re.compile(r"""
\(
https://arxiv\.org/abs/  # arXiv abstract URL prefix
(?P<id>[0-9\.]+)         # Paper ID, e.g. 1706.03762
\)
""", re.VERBOSE)

# Rendered HTML pages to skip entirely (paths relative to HOME).
IGNORE = {
    'neox/model.html',
    'transformers/index.html',
    'transformers/configs.html',
    'optimizers/noam.html',
    'transformers/basic/autoregressive_experiment.html',
    'transformers/xl/relative_mha.html',
    'capsule_networks/mnist.html',
    'transformers/rope/value_pe/index.html',
}

# Paper ids that are cited incidentally and should not be indexed.
IGNORE_PAPERS = {
    '2002.04745',  # On Layer Normalization in the Transformer Architecture
    '1606.08415',  # Gaussian Error Linear Units (GELUs)
    '1710.10196',  # Progressive Growing of GANs for Improved Quality, Stability, and Variation
    '1904.11486',  # Making Convolutional Networks Shift-Invariant Again
    '1801.04406',  # Which Training Methods for GANs do actually Converge?
    '1812.04948',  # A Style-Based Generator Architecture for Generative Adversarial Networks
    '1705.10528',  # Constrained Policy Optimization
}
def collect(path: Path):
    """Recursively collect arXiv references from ``.py`` files under *path*.

    Returns a list of ``{'url': <rendered html path>, 'arxiv_id': <id>}``
    dicts, where the URL is relative to ``HOME``.
    """
    if path.is_file():
        html = path.relative_to(HOME)
        # Only Python sources get rendered into HTML pages.
        if html.suffix not in {'.py'}:
            return []
        # Map source file -> rendered page: `__init__.py` becomes `index.html`,
        # any other module becomes `<stem>.html`.
        if html.stem == '__init__':
            html = html.parent / 'index.html'
        else:
            html = html.parent / f'{html.stem}.html'
        if str(html) in IGNORE:
            return []
        # Read explicitly as UTF-8: the locale default encoding is platform
        # dependent and can fail on non-ASCII characters in the sources.
        contents = path.read_text(encoding='utf-8')
        papers = {m.group('id') for m in REGEX.finditer(contents)} - IGNORE_PAPERS
        if len(papers) > 1:
            # Flag pages that cite more than one paper, for manual review.
            logger.log([(str(html), Text.key), ': ', str(papers)])
        return [{'url': str(html), 'arxiv_id': p} for p in papers]

    # Directory: recurse into every entry and concatenate the results.
    urls = []
    for child in path.iterdir():
        urls += collect(child)
    return urls
def main():
    """Scan ``HOME`` for arXiv citations and write ``docs/papers.json``.

    The output JSON maps each arXiv id to the list of site URLs citing it,
    with ids in ascending order.
    """
    papers = []
    for f in HOME.iterdir():
        papers += collect(f)
    # Sort by id so the grouped dict below is built in id order
    # (dict insertion order is preserved, so the JSON keys come out sorted).
    papers.sort(key=lambda p: p['arxiv_id'])

    by_id = {}
    for p in papers:
        by_id.setdefault(p['arxiv_id'], []).append(f'''https://nn.labml.ai/{p['url']}''')

    logger.log([('Papers', Text.key), ': ', f'{len(by_id) :,}'])
    # Write explicitly as UTF-8 so the output is identical across platforms.
    with open(str(HOME.parent / 'docs' / 'papers.json'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(by_id, indent=1))


if __name__ == '__main__':
    main()